Zoltan2
Zoltan2_AlgMultiJagged.hpp
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // Zoltan2: A package of combinatorial algorithms for scientific computing
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact Karen Devine (kddevin@sandia.gov)
39 // Erik Boman (egboman@sandia.gov)
40 // Siva Rajamanickam (srajama@sandia.gov)
41 //
42 // ***********************************************************************
43 //
44 // @HEADER
49 #ifndef _ZOLTAN2_ALGMultiJagged_HPP_
50 #define _ZOLTAN2_ALGMultiJagged_HPP_
51 
54 #include <Zoltan2_Parameters.hpp>
55 #include <Zoltan2_Algorithm.hpp>
57 #include <Teuchos_StandardParameterEntryValidators.hpp>
58 
59 #include <Tpetra_Distributor.hpp>
60 #include <Teuchos_ParameterList.hpp>
62 #include <new> // ::operator new[]
63 #include <algorithm> // std::sort
64 #include <Zoltan2_Util.hpp>
65 #include <vector>
66 
67 #if defined(__cplusplus) && __cplusplus >= 201103L
68 #include <unordered_map>
69 #else
70 #include <Teuchos_Hashtable.hpp>
71 #endif // C++11 is enabled
72 
73 #ifdef ZOLTAN2_USEZOLTANCOMM
74 #ifdef HAVE_ZOLTAN2_MPI
75 #define ENABLE_ZOLTAN_MIGRATION
76 #include "zoltan_comm_cpp.h"
77 #include "zoltan_types.h" // for error codes
78 #endif
79 #endif
80 
81 #ifdef HAVE_ZOLTAN2_OMP
82 #include <omp.h>
83 #endif
84 
85 #define LEAST_SIGNIFICANCE 0.0001
86 #define SIGNIFICANCE_MUL 1000
87 
88 //if the (last dimension reduce all count) x the mpi world size is
89 //estimated to be bigger than this number, then migration will be forced
90 //in earlier iterations.
91 #define FUTURE_REDUCEALL_CUTOFF 1500000
92 //if parts right before the last dimension are estimated to have fewer than
93 //MIN_WORK_LAST_DIM coords, migration will be forced in earlier iterations.
94 #define MIN_WORK_LAST_DIM 1000
95 
96 
97 
98 
99 #define ZOLTAN2_ABS(x) ((x) >= 0 ? (x) : -(x))
100 //imbalance calculation. Wreal / Wexpected - 1
101 #define imbalanceOf(Wachieved, totalW, expectedRatio) \
102  (Wachieved) / ((totalW) * (expectedRatio)) - 1
103 #define imbalanceOf2(Wachieved, wExpected) \
104  (Wachieved) / (wExpected) - 1
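// Illustrative sketch (added; not from the original source): how the two
// imbalance macros above relate. The weights below are hypothetical, and the
// block is fenced with #if 0 so it has no effect on compilation.
#if 0
inline double example_imbalance()
{
  double achieved = 130.0; // weight actually assigned to a part
  double total    = 400.0; // total weight over all parts
  double ratio    = 0.25;  // expected fraction of the total for this part
  // imbalanceOf: 130 / (400 * 0.25) - 1 = 0.3, i.e. 30% above the target
  double imb  = imbalanceOf(achieved, total, ratio);
  // imbalanceOf2 takes the expected part weight directly; same result here
  double imb2 = imbalanceOf2(achieved, total * ratio);
  return imb - imb2; // 0.0
}
#endif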
105 
106 
107 #define ZOLTAN2_ALGMULTIJAGGED_SWAP(a,b,temp) temp=(a);(a)=(b);(b)=temp;
108 
109 
110 namespace Teuchos{
111 
116 template <typename Ordinal, typename T>
117 class Zoltan2_BoxBoundaries : public ValueTypeReductionOp<Ordinal,T>
118 {
119 private:
120  Ordinal size;
121  T _EPSILON;
122 
123 public:
126  Zoltan2_BoxBoundaries ():size(0), _EPSILON (std::numeric_limits<T>::epsilon()){}
127 
134  Zoltan2_BoxBoundaries (Ordinal s_):
135  size(s_), _EPSILON (std::numeric_limits<T>::epsilon()){}
136 
139  void reduce( const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
140  {
141  for (Ordinal i=0; i < count; i++){
142  if (Z2_ABS(inBuffer[i]) > _EPSILON){
143  inoutBuffer[i] = inBuffer[i];
144  }
145  }
146  }
147 };
148 } // namespace Teuchos
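// Illustrative sketch (added; not from the original source): Zoltan2_BoxBoundaries
// is a Teuchos reduction operator that keeps whichever entry exceeds epsilon,
// which is how per-rank box boundaries can be merged across ranks. The buffer
// contents are hypothetical and the block is fenced with #if 0.
#if 0
void example_box_boundary_reduce(const Teuchos::Comm<int> &comm)
{
  const int n = 4;
  double localBounds[n]  = {0.0, 2.0, 0.0, 0.0}; // zeros mean "not known on this rank"
  double globalBounds[n] = {0.0, 0.0, 0.0, 0.0};
  Teuchos::Zoltan2_BoxBoundaries<int, double> op(n);
  Teuchos::reduceAll<int, double>(comm, op, n, localBounds, globalBounds);
}
#endif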
149 
150 namespace Zoltan2{
151 
155 template <typename T>
156 T *allocMemory(size_t size){
157  if (size > 0){
158  T * a = new T[size];
159  if (a == NULL) {
160  throw "cannot allocate memory";
161  }
162  return a;
163  }
164  else {
165  return NULL;
166  }
167 }
168 
172 template <typename T>
173 void freeArray(T *&array){
174  if(array != NULL){
175  delete [] array;
176  array = NULL;
177  }
178 }
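// Illustrative usage sketch (added; not from the original source) for the
// allocMemory/freeArray helpers above. The array size is arbitrary and the
// block is fenced with #if 0.
#if 0
void example_alloc_free()
{
  // allocMemory returns NULL for size 0 and throws if allocation fails
  double *work = allocMemory<double>(128);
  for (int i = 0; i < 128; ++i) work[i] = 0.0;
  // freeArray deletes the array and resets the pointer to NULL
  freeArray<double>(work);
}
#endif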
179 
180 
188 template <typename IT, typename CT, typename WT>
189 class uMultiSortItem
190 {
191 public:
192  //TODO: Why volatile?
193  //no idea, another Intel compiler failure.
194  volatile IT index;
195  volatile CT count;
196  //unsigned int val;
197  volatile WT *val;
198  volatile WT _EPSILON;
199 
199 
200  uMultiSortItem(){
201  this->index = 0;
202  this->count = 0;
203  this->val = NULL;
204  this->_EPSILON = std::numeric_limits<WT>::epsilon();
205  }
206 
207 
208  uMultiSortItem(IT index_ ,CT count_, WT *vals_){
209  this->index = index_;
210  this->count = count_;
211  this->val = vals_;
212  this->_EPSILON = std::numeric_limits<WT>::epsilon();
213  }
214 
215  uMultiSortItem( const uMultiSortItem<IT,CT,WT>& other ){
216  this->index = other.index;
217  this->count = other.count;
218  this->val = other.val;
219  this->_EPSILON = other._EPSILON;
220  }
221 
222  ~uMultiSortItem(){
223  //freeArray<WT>(this->val);
224  }
225 
226  void set(IT index_ ,CT count_, WT *vals_){
227  this->index = index_;
228  this->count = count_;
229  this->val = vals_;
230  }
231 
232 
233  uMultiSortItem<IT,CT,WT> operator=(const uMultiSortItem<IT,CT,WT>& other){
234  this->index = other.index;
235  this->count = other.count;
236  this->val = other.val;
237  return *(this);
238  }
239 
240  bool operator<(const uMultiSortItem<IT,CT,WT>& other) const{
241  assert (this->count == other.count);
242  for(CT i = 0; i < this->count; ++i){
243  //if the values are equal go to next one.
244  if (ZOLTAN2_ABS(this->val[i] - other.val[i]) < this->_EPSILON){
245  continue;
246  }
247  //if next value is smaller return true;
248  if(this->val[i] < other.val[i]){
249  return true;
250  }
251  //if next value is bigger return false;
252  else {
253  return false;
254  }
255  }
256  //if they are totally equal.
257  return this->index < other.index;
258  }
259  bool operator>(const uMultiSortItem<IT,CT,WT>& other) const{
260  assert (this->count == other.count);
261  for(CT i = 0; i < this->count; ++i){
262  //if the values are equal go to next one.
263  if (ZOLTAN2_ABS(this->val[i] - other.val[i]) < this->_EPSILON){
264  continue;
265  }
266  //if next value is bigger return true;
267  if(this->val[i] > other.val[i]){
268  return true;
269  }
270  //if next value is smaller return false;
271  else //(this->val[i] < other.val[i])
272  {
273  return false;
274  }
275  }
276  //if they are totally equal.
277  return this->index > other.index;
278  }
279 };// uMultiSortItem
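// Illustrative sketch (added; not from the original source): uMultiSortItem
// compares its val arrays lexicographically (within _EPSILON) and falls back
// to the index on ties, so plain std::sort yields a multi-key sort. The
// coordinates are hypothetical and the block is fenced with #if 0.
#if 0
void example_multi_key_sort()
{
  const int npoints = 3, ndim = 2;
  double coords[npoints][ndim] = { {1.0, 5.0}, {1.0, 2.0}, {0.5, 9.0} };
  std::vector<uMultiSortItem<int, int, double> > items(npoints);
  for (int i = 0; i < npoints; ++i)
    items[i].set(i, ndim, coords[i]);
  // after sorting, the items are ordered by index as 2, 1, 0
  std::sort(items.begin(), items.end());
}
#endif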
280 
284 template <class IT, class WT>
285 struct uSortItem
286 {
287  IT id;
288  //unsigned int val;
289  WT val;
290 };// uSortItem;
291 
295 template <class IT, class WT>
296 void uqsort(IT n, uSortItem<IT, WT> * arr)
297 {
298 
299  int NSTACK = 50;
300  int M = 7;
301  IT i, ir=n, j, k, l=1;
302  IT jstack=0, istack[50];
303  WT aval;
304  uSortItem<IT,WT> a, temp;
305 
306  --arr;
307  for (;;)
308  {
309  if (ir-l < M)
310  {
311  for (j=l+1;j<=ir;j++)
312  {
313  a=arr[j];
314  aval = a.val;
315  for (i=j-1;i>=1;i--)
316  {
317  if (arr[i].val <= aval)
318  break;
319  arr[i+1] = arr[i];
320  }
321  arr[i+1]=a;
322  }
323  if (jstack == 0)
324  break;
325  ir=istack[jstack--];
326  l=istack[jstack--];
327  }
328  else
329  {
330  k=(l+ir) >> 1;
331 
332  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[k],arr[l+1], temp)
333  if (arr[l+1].val > arr[ir].val)
334  {
335  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l+1],arr[ir],temp)
336  }
337  if (arr[l].val > arr[ir].val)
338  {
339  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l],arr[ir],temp)
340  }
341  if (arr[l+1].val > arr[l].val)
342  {
343  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l+1],arr[l],temp)
344  }
345  i=l+1;
346  j=ir;
347  a=arr[l];
348  aval = a.val;
349  for (;;)
350  {
351  do i++; while (arr[i].val < aval);
352  do j--; while (arr[j].val > aval);
353  if (j < i) break;
354  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[i],arr[j],temp);
355  }
356  arr[l]=arr[j];
357  arr[j]=a;
358  jstack += 2;
359  if (jstack > NSTACK){
360  std::cout << "uqsort: NSTACK too small in sort." << std::endl;
361  exit(1);
362  }
363  if (ir-i+1 >= j-l)
364  {
365  istack[jstack]=ir;
366  istack[jstack-1]=i;
367  ir=j-1;
368  }
369  else
370  {
371  istack[jstack]=j-1;
372  istack[jstack-1]=l;
373  l=i;
374  }
375  }
376  }
377 }
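// Illustrative sketch (added; not from the original source): uqsort orders a
// uSortItem array by increasing val. The values are hypothetical and the
// block is fenced with #if 0.
#if 0
void example_uqsort()
{
  uSortItem<int, double> arr[4];
  const double vals[4] = {3.5, 1.0, 2.5, 0.5};
  for (int i = 0; i < 4; ++i) { arr[i].id = i; arr[i].val = vals[i]; }
  uqsort<int, double>(4, arr); // ids now appear in the order 3, 1, 2, 0
}
#endif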
378 
379 template <class IT, class WT, class SIGN>
380 struct uSignedSortItem
381 {
382  IT id;
383  //unsigned int val;
384  WT val;
385  SIGN signbit; // 1 means positive, 0 means negative.
386  bool operator<(const uSignedSortItem<IT, WT, SIGN>& rhs) const {
387  /*if I am negative, the other is positive*/
388  if (this->signbit < rhs.signbit){
389  return true;
390  }
391  /*if both have the same sign*/
392  else if (this->signbit == rhs.signbit){
393 
394  if (this->val < rhs.val){//if my value is smaller,
395  return this->signbit;//then if we both are positive return true.
396  //if we both are negative, return false.
397  }
398  else if (this->val > rhs.val){//if my value is larger,
399  return !this->signbit; //then if we both are positive return false.
400  //if we both are negative, return true.
401  }
402  else { //if both are equal.
403  return false;
404  }
405  }
406  else {
407  /*if I am positive, the other is negative*/
408  return false;
409  }
410 
411  }
412  bool operator>(const uSignedSortItem<IT, WT, SIGN>& rhs) const {
413  /*if I am positive, the other is negative*/
414  if (this->signbit > rhs.signbit){
415  return true;
416  }
417  /*if both have the same sign*/
418  else if (this->signbit == rhs.signbit){
419 
420  if (this->val < rhs.val){//if my value is smaller,
421  return !this->signbit;//then if we both are positive return false.
422  //if we both are negative, return true.
423  }
424  else if (this->val > rhs.val){//if my value is larger,
425  return this->signbit; //then if we both are positive return true.
426  //if we both are negative, return false.
427  }
428  else { // if they are equal
429  return false;
430  }
431  }
432  else {
433  /*if I am negative, the other is positive*/
434  return false;
435  }
436  }
437  bool operator<=(const uSignedSortItem<IT, WT, SIGN>& rhs){
438  return !(*this > rhs);}
439  bool operator>=(const uSignedSortItem<IT, WT, SIGN>& rhs){
440  return !(*this < rhs);}
441 };
442 
446 template <class IT, class WT, class SIGN>
447 void uqSignsort(IT n, uSignedSortItem<IT, WT, SIGN> * arr){
448 
449  IT NSTACK = 50;
450  IT M = 7;
451  IT i, ir=n, j, k, l=1;
452  IT jstack=0, istack[50];
453  uSignedSortItem<IT,WT,SIGN> a, temp;
454 
455  --arr;
456  for (;;)
457  {
458  if (ir < M + l)
459  {
460  for (j=l+1;j<=ir;j++)
461  {
462  a=arr[j];
463  for (i=j-1;i>=1;i--)
464  {
465  if (arr[i] <= a)
466  {
467  break;
468  }
469  arr[i+1] = arr[i];
470  }
471  arr[i+1]=a;
472  }
473  if (jstack == 0)
474  break;
475  ir=istack[jstack--];
476  l=istack[jstack--];
477  }
478  else
479  {
480  k=(l+ir) >> 1;
481  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[k],arr[l+1], temp)
482  if (arr[l+1] > arr[ir])
483  {
484  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l+1],arr[ir],temp)
485  }
486  if (arr[l] > arr[ir])
487  {
488  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l],arr[ir],temp)
489  }
490  if (arr[l+1] > arr[l])
491  {
492  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l+1],arr[l],temp)
493  }
494  i=l+1;
495  j=ir;
496  a=arr[l];
497  for (;;)
498  {
499  do i++; while (arr[i] < a);
500  do j--; while (arr[j] > a);
501  if (j < i) break;
502  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[i],arr[j],temp);
503  }
504  arr[l]=arr[j];
505  arr[j]=a;
506  jstack += 2;
507  if (jstack > NSTACK){
508  std::cout << "uqsort: NSTACK too small in sort." << std::endl;
509  exit(1);
510  }
511  if (ir+l+1 >= j+i)
512  {
513  istack[jstack]=ir;
514  istack[jstack-1]=i;
515  ir=j-1;
516  }
517  else
518  {
519  istack[jstack]=j-1;
520  istack[jstack-1]=l;
521  l=i;
522  }
523  }
524  }
525 }
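// Illustrative sketch (added; not from the original source): uqSignsort orders
// uSignedSortItems by their signed value, where signbit==1 encodes a positive
// value and signbit==0 a negative one. The data is hypothetical and the block
// is fenced with #if 0.
#if 0
void example_uqSignsort()
{
  // the four items below represent +2.0, -3.0, +0.5, -1.0
  uSignedSortItem<int, double, char> arr[4];
  const double vals[4]  = {2.0, 3.0, 0.5, 1.0};
  const char   signs[4] = {1, 0, 1, 0};
  for (int i = 0; i < 4; ++i) {
    arr[i].id = i; arr[i].val = vals[i]; arr[i].signbit = signs[i];
  }
  uqSignsort<int, double, char>(4, arr); // ascending order of ids: 1, 3, 2, 0
}
#endif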
526 
530 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
531  typename mj_part_t>
532 class AlgMJ
533 {
534 private:
536  typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
537 
538  RCP<const Environment> mj_env; //the environment object
539  RCP<const Comm<int> > mj_problemComm; //initial comm object
540 
541  double imbalance_tolerance; //input imbalance tolerance.
542  mj_part_t *part_no_array; //input part array specifying num part to divide along each dim.
543  int recursion_depth; //the number of steps that partitioning will be solved in.
544  int coord_dim, num_weights_per_coord; //coordinate dim and # of weights per coord
545 
546  size_t initial_num_loc_coords; //initial num local coords.
547  global_size_t initial_num_glob_coords; //initial num global coords.
548 
549  mj_lno_t num_local_coords; //number of local coords.
550  mj_gno_t num_global_coords; //number of global coords.
551 
552  mj_scalar_t **mj_coordinates; //two dimension coordinate array
553  mj_scalar_t **mj_weights; //two dimension weight array
554  bool *mj_uniform_parts; //if the target parts are uniform
555  mj_scalar_t **mj_part_sizes; //target part weight sizes.
556  bool *mj_uniform_weights; //if the coordinates have uniform weights.
557 
558  ArrayView<const mj_gno_t> mj_gnos; //global ids of the coordinates, comes from the input
559  size_t num_global_parts; //the targeted number of parts
560 
561  mj_gno_t *initial_mj_gnos; //initial global ids of the coordinates.
562  mj_gno_t *current_mj_gnos; //current global ids of the coordinates, might change during migration.
563  int *owner_of_coordinate; //the actual processor owner of the coordinate, to track after migrations.
564 
565  mj_lno_t *coordinate_permutations; //permutation of coordinates, for partitioning.
566  mj_lno_t *new_coordinate_permutations; //permutation work array.
567  mj_part_t *assigned_part_ids; //the part ids assigned to coordinates.
568 
569  mj_lno_t *part_xadj; //beginning and end of each part.
570  mj_lno_t *new_part_xadj; // work array for beginning and end of each part.
571 
572  //get mj specific parameters.
573  bool distribute_points_on_cut_lines; //if partitioning can distribute points on the same coordinate to different parts.
574  mj_part_t max_concurrent_part_calculation; // how many parts we can calculate concurrently.
575 
576  bool mj_run_as_rcb; //if this is set, then recursion depth is adjusted to its maximum value.
577  int mj_user_recursion_depth; //the recursion depth value provided by user.
578  bool mj_keep_part_boxes; //if the boxes need to be kept.
579 
580  int check_migrate_avoid_migration_option; //whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
581  int migration_type; // when doing the migration, 0 will aim for perfect load balance,
582  //1 will aim for a minimized number of messages, with possibly bad load balance
583  double minimum_migration_imbalance; //when MJ decides whether to migrate, the minimum imbalance for migration.
584  int num_threads; //num threads
585 
586  // Nonuniform first level partitioning (Currently available only for sequential_task_partitioning):
587  // Used for Dragonfly task mapping by partitioning Dragonfly RCA
588  // machine coordinates and application coordinates.
589  // An optimization that completely partitions the most important machine dimension
590  // first (i.e. the Dragonfly group coordinate, or RCA's x coordinate). The standard
591  // MJ alg follows after the nonuniform first level partitioning.
592  //
593  // Ex. (first level partitioning): If we have 120 elements,
594  // num_first_level_parts = 3, first_level_distribution = [4, 10, 6], then
595  // part sizes after first level will be [24, 60, 36] (4+10+6 = 20 units, so each unit maps to 120/20 = 6 elements).
596  // Standard uniform MJ continues for all subsequent levels.
597  mj_part_t num_first_level_parts; // If used, number of parts requested for a nonuniform first level partitioning
598  const mj_part_t *first_level_distribution; // If used, the requested distribution of parts for the nonuniform first level partitioning
599 
600  mj_part_t total_num_cut; //total number of cuts
601  mj_part_t total_num_part; //total number of parts
602 
603  mj_part_t max_num_part_along_dim ; //maximum part count along a dimension.
604  mj_part_t max_num_cut_along_dim; //maximum cut count along a dimension.
605  size_t max_num_total_part_along_dim; //maximum part+cut count along a dimension.
606 
607  mj_part_t total_dim_num_reduce_all; //estimate on #reduceAlls can be done.
608  mj_part_t last_dim_num_part; //max no of parts that might occur
609  //during the partition before the
610  //last partitioning dimension.
611 
612  RCP<Comm<int> > comm; //comm object than can be altered during execution
613  float fEpsilon; //epsilon for float
614  mj_scalar_t sEpsilon; //epsilon for mj_scalar_t
615 
616  mj_scalar_t maxScalar_t; //max possible scalar
617  mj_scalar_t minScalar_t; //min scalar
618 
619  mj_scalar_t *all_cut_coordinates;
620  mj_scalar_t *max_min_coords;
621  mj_scalar_t *process_cut_line_weight_to_put_left; //how much weight an MPI rank should put on the left side of each cut line
622  mj_scalar_t **thread_cut_line_weight_to_put_left; //how much weight percentage each thread in the MPI rank should put on the left side of each cut line
623 
624  // work array to manipulate coordinate of cutlines in different iterations.
625  //necessary because previous cut line information is used for determining
626  //the next cutline information. therefore, cannot update the cut work array
627  //until all cutlines are determined.
628  mj_scalar_t *cut_coordinates_work_array;
629 
630  //cumulative part weight array.
631  mj_scalar_t *target_part_weights;
632 
633  mj_scalar_t *cut_upper_bound_coordinates ; //upper bound coordinate of a cut line
634  mj_scalar_t *cut_lower_bound_coordinates ; //lower bound coordinate of a cut line
635  mj_scalar_t *cut_lower_bound_weights ; //lower bound weight of a cut line
636  mj_scalar_t *cut_upper_bound_weights ; //upper bound weight of a cut line
637 
638  mj_scalar_t *process_local_min_max_coord_total_weight ; //combined array to exchange the min and max coordinate, and total weight of part.
639  mj_scalar_t *global_min_max_coord_total_weight ;//global combined array with the results for min, max and total weight.
640 
641  //isDone is used to determine if a cutline is determined already.
642  //If a cut line is already determined, the next iterations will skip this cut line.
643  bool *is_cut_line_determined;
644  //my_incomplete_cut_count holds the number of cutlines that have not been finalized for each part
645  //when concurrentPartCount>1, using this information, if my_incomplete_cut_count[x]==0, then no work is done for this part.
646  mj_part_t *my_incomplete_cut_count;
647  //local part weights of each thread.
648  double **thread_part_weights;
649  //the work manipulation array for partweights.
650  double **thread_part_weight_work;
651 
652  //thread_cut_left_closest_point to hold the closest coordinate to a cutline from left (for each thread).
653  mj_scalar_t **thread_cut_left_closest_point;
654  //thread_cut_right_closest_point to hold the closest coordinate to a cutline from right (for each thread)
655  mj_scalar_t **thread_cut_right_closest_point;
656 
657  //to store how many points in each part a thread has.
658  mj_lno_t **thread_point_counts;
659 
660  mj_scalar_t *process_rectilinear_cut_weight;
661  mj_scalar_t *global_rectilinear_cut_weight;
662 
663  //for faster communication, concatenation of
664  //totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
665  //leftClosest distances sized P-1, since P-1 cut lines
666  //rightClosest distances sized P-1, since P-1 cut lines.
667  mj_scalar_t *total_part_weight_left_right_closests ;
668  mj_scalar_t *global_total_part_weight_left_right_closests;
669 
670  RCP<mj_partBoxVector_t> kept_boxes; // vector of all boxes for all parts;
671  // constructed only if
672  // mj_keep_part_boxes == true
673  RCP<mj_partBox_t> global_box;
674  int myRank, myActualRank; //processor rank, and initial rank
675 
676  bool divide_to_prime_first;
677 
678  /* \brief Either the mj array (part_no_array) or num_global_parts should be provided in
679  * the input. part_no_array takes
680  * precedence if both are provided.
681  * Depending on these parameters, total cut/part number,
682  * maximum part/cut number along a dimension, estimated number of reduceAlls,
683  * and the number of parts before the last dimension is calculated.
684  * */
685  void set_part_specifications();
686 
687  /* \brief Tries to determine the part number for current dimension,
688  * by trying to make the partitioning as square as possible.
689  * \param num_total_future how many more partitionings are required.
690  * \param root how many more recursion depth is left.
691  */
692  inline mj_part_t get_part_count(
693  mj_part_t num_total_future,
694  double root);
695 
696  /* \brief Allocates the all required memory for the mj partitioning algorithm.
697  *
698  */
699  void allocate_set_work_memory();
700 
701  /* \brief for part communication we keep track of the box boundaries.
702  * This is performed when either asked specifically, or when geometric mapping is performed afterwards.
703  * This function initializes a single box with all global min and max coordinates.
704  * \param initial_partitioning_boxes the input and output vector for boxes.
705  */
706  void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);
707 
708  /* \brief compute global bounding box: min/max coords of global domain */
709  void compute_global_box();
710 
711  /* \brief Returns how many parts will be obtained after this dimension's partitioning.
712  * It sets how many parts each current part will be partitioned into in this dimension in the num_partitioning_in_current_dim vector,
713  * sets how many total future parts each obtained part will be partitioned into in the next_future_num_parts_in_parts vector,
714  * and, if part boxes are kept, initializes the output_part_boxes from their ancestors.
715  *
716  * \param num_partitioning_in_current_dim: output. How many parts each current part will be partitioned into.
717  * \param future_num_part_in_parts: input, how many future parts each current part will be partitioned into.
718  * \param next_future_num_parts_in_parts: output, how many future parts each obtained part will be partitioned into.
719  * \param future_num_parts: output, max number of future parts that will be obtained from a single
720  * \param current_num_parts: input, how many parts are there currently.
721  * \param current_iteration: input, current dimension iteration number.
722  * \param input_part_boxes: input, if boxes are kept, current boxes.
723  * \param output_part_boxes: output, if boxes are kept, the initial box boundaries for obtained parts.
724  */
725  mj_part_t update_part_num_arrays(
726  std::vector<mj_part_t> &num_partitioning_in_current_dim, //assumes this vector is empty.
727  std::vector<mj_part_t> *future_num_part_in_parts,
728  std::vector<mj_part_t> *next_future_num_parts_in_parts, //assumes this vector is empty.
729  mj_part_t &future_num_parts,
730  mj_part_t current_num_parts,
731  int current_iteration,
732  RCP<mj_partBoxVector_t> input_part_boxes,
733  RCP<mj_partBoxVector_t> output_part_boxes,
734  mj_part_t atomic_part_count);
735 
747  void mj_get_local_min_max_coord_totW(
748  mj_lno_t coordinate_begin_index,
749  mj_lno_t coordinate_end_index,
750  mj_lno_t *mj_current_coordinate_permutations,
751  mj_scalar_t *mj_current_dim_coords,
752  mj_scalar_t &min_coordinate,
753  mj_scalar_t &max_coordinate,
754  mj_scalar_t &total_weight);
755 
763  void mj_get_global_min_max_coord_totW(
764  mj_part_t current_concurrent_num_parts,
765  mj_scalar_t *local_min_max_total,
766  mj_scalar_t *global_min_max_total);
767 
795  void mj_get_initial_cut_coords_target_weights(
796  mj_scalar_t min_coord,
797  mj_scalar_t max_coord,
798  mj_part_t num_cuts/*p-1*/ ,
799  mj_scalar_t global_weight,
800  mj_scalar_t *initial_cut_coords /*p - 1 sized, coordinate of each cut line*/,
801  mj_scalar_t *target_part_weights /*cumulative weights, at left side of each cut line. p-1 sized*/,
802 
803  std::vector <mj_part_t> *future_num_part_in_parts, //the vecto
804  std::vector <mj_part_t> *next_future_num_parts_in_parts,
805  mj_part_t concurrent_current_part,
806  mj_part_t obtained_part_index,
807  mj_part_t num_target_first_level_parts = 1,
808  const mj_part_t *target_first_level_dist = NULL);
809 
822  void set_initial_coordinate_parts(
823  mj_scalar_t &max_coordinate,
824  mj_scalar_t &min_coordinate,
825  mj_part_t &concurrent_current_part_index,
826  mj_lno_t coordinate_begin_index,
827  mj_lno_t coordinate_end_index,
828  mj_lno_t *mj_current_coordinate_permutations,
829  mj_scalar_t *mj_current_dim_coords,
830  mj_part_t *mj_part_ids,
831  mj_part_t &partition_count);
832 
843  void mj_1D_part(
844  mj_scalar_t *mj_current_dim_coords,
845  double imbalanceTolerance,
846  mj_part_t current_work_part,
847  mj_part_t current_concurrent_num_parts,
848  mj_scalar_t *current_cut_coordinates,
849  mj_part_t total_incomplete_cut_count,
850  std::vector <mj_part_t> &num_partitioning_in_current_dim);
851 
871  void mj_1D_part_get_thread_part_weights(
872  size_t total_part_count,
873  mj_part_t num_cuts,
874  mj_scalar_t max_coord,
875  mj_scalar_t min_coord,
876  mj_lno_t coordinate_begin_index,
877  mj_lno_t coordinate_end_index,
878  mj_scalar_t *mj_current_dim_coords,
879  mj_scalar_t *temp_current_cut_coords,
880  bool *current_cut_status,
881  double *my_current_part_weights,
882  mj_scalar_t *my_current_left_closest,
883  mj_scalar_t *my_current_right_closest);
884 
892  void mj_accumulate_thread_results(
893  const std::vector <mj_part_t> &num_partitioning_in_current_dim,
894  mj_part_t current_work_part,
895  mj_part_t current_concurrent_num_parts);
896 
927  void mj_get_new_cut_coordinates(
928  const size_t &num_total_part,
929  const mj_part_t &num_cuts,
930  const mj_scalar_t &max_coordinate,
931  const mj_scalar_t &min_coordinate,
932  const mj_scalar_t &global_total_weight,
933  const double &used_imbalance_tolerance,
934  mj_scalar_t * current_global_part_weights,
935  const mj_scalar_t * current_local_part_weights,
936  const mj_scalar_t *current_part_target_weights,
937  bool *current_cut_line_determined,
938  mj_scalar_t *current_cut_coordinates,
939  mj_scalar_t *current_cut_upper_bounds,
940  mj_scalar_t *current_cut_lower_bounds,
941  mj_scalar_t *current_global_left_closest_points,
942  mj_scalar_t *current_global_right_closest_points,
943  mj_scalar_t * current_cut_lower_bound_weights,
944  mj_scalar_t * current_cut_upper_weights,
945  mj_scalar_t *new_current_cut_coordinates,
946  mj_scalar_t *current_part_cut_line_weight_to_put_left,
947  mj_part_t *rectilinear_cut_count,
948  mj_part_t &my_num_incomplete_cut);
949 
959  void mj_calculate_new_cut_position (
960  mj_scalar_t cut_upper_bound,
961  mj_scalar_t cut_lower_bound,
962  mj_scalar_t cut_upper_weight,
963  mj_scalar_t cut_lower_weight,
964  mj_scalar_t expected_weight,
965  mj_scalar_t &new_cut_position);
966 
977  void mj_create_new_partitions(
978  mj_part_t num_parts,
979  mj_scalar_t *mj_current_dim_coords,
980  mj_scalar_t *current_concurrent_cut_coordinate,
981  mj_lno_t coordinate_begin,
982  mj_lno_t coordinate_end,
983  mj_scalar_t *used_local_cut_line_weight_to_left,
984  double **used_thread_part_weight_work,
985  mj_lno_t *out_part_xadj);
986 
1009  bool mj_perform_migration(
1010  mj_part_t in_num_parts, //current umb parts
1011  mj_part_t &out_num_parts, //output umb parts.
1012  std::vector<mj_part_t> *next_future_num_parts_in_parts,
1013  mj_part_t &output_part_begin_index,
1014  size_t migration_reduce_all_population,
1015  mj_lno_t num_coords_for_last_dim_part,
1016  std::string iteration,
1017  RCP<mj_partBoxVector_t> &input_part_boxes,
1018  RCP<mj_partBoxVector_t> &output_part_boxes);
1019 
1029  void get_processor_num_points_in_parts(
1030  mj_part_t num_procs,
1031  mj_part_t num_parts,
1032  mj_gno_t *&num_points_in_all_processor_parts);
1033 
1046  bool mj_check_to_migrate(
1047  size_t migration_reduce_all_population,
1048  mj_lno_t num_coords_for_last_dim_part,
1049  mj_part_t num_procs,
1050  mj_part_t num_parts,
1051  mj_gno_t *num_points_in_all_processor_parts);
1052 
1053 
1071  void mj_migration_part_proc_assignment(
1072  mj_gno_t * num_points_in_all_processor_parts,
1073  mj_part_t num_parts,
1074  mj_part_t num_procs,
1075  mj_lno_t *send_count_to_each_proc,
1076  std::vector<mj_part_t> &processor_ranks_for_subcomm,
1077  std::vector<mj_part_t> *next_future_num_parts_in_parts,
1078  mj_part_t &out_num_part,
1079  std::vector<mj_part_t> &out_part_indices,
1080  mj_part_t &output_part_numbering_begin_index,
1081  int *coordinate_destinations);
1082 
1099  void mj_assign_proc_to_parts(
1100  mj_gno_t * num_points_in_all_processor_parts,
1101  mj_part_t num_parts,
1102  mj_part_t num_procs,
1103  mj_lno_t *send_count_to_each_proc,
1104  std::vector<mj_part_t> &processor_ranks_for_subcomm,
1105  std::vector<mj_part_t> *next_future_num_parts_in_parts,
1106  mj_part_t &out_part_index,
1107  mj_part_t &output_part_numbering_begin_index,
1108  int *coordinate_destinations);
1109 
1120  void assign_send_destinations(
1121  mj_part_t num_parts,
1122  mj_part_t *part_assignment_proc_begin_indices,
1123  mj_part_t *processor_chains_in_parts,
1124  mj_lno_t *send_count_to_each_proc,
1125  int *coordinate_destinations);
1126 
1139  void assign_send_destinations2(
1140  mj_part_t num_parts,
1141  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment, //input sorted wrt processors
1142  int *coordinate_destinations,
1143  mj_part_t &output_part_numbering_begin_index,
1144  std::vector<mj_part_t> *next_future_num_parts_in_parts);
1145 
1162  void mj_assign_parts_to_procs(
1163  mj_gno_t * num_points_in_all_processor_parts,
1164  mj_part_t num_parts,
1165  mj_part_t num_procs,
1166  mj_lno_t *send_count_to_each_proc, //output: sized nprocs, show the number of send point counts to each proc.
1167  std::vector<mj_part_t> *next_future_num_parts_in_parts,//input how many more partitions the part will be partitioned into.
1168  mj_part_t &out_num_part, //output, how many parts the processor will have. this is always 1 for this function.
1169  std::vector<mj_part_t> &out_part_indices, //output: the part indices which the processor is assigned to.
1170  mj_part_t &output_part_numbering_begin_index, //output: how much the part number should be shifted when setting the solution
1171  int *coordinate_destinations);
1172 
1185  void mj_migrate_coords(
1186  mj_part_t num_procs,
1187  mj_lno_t &num_new_local_points,
1188  std::string iteration,
1189  int *coordinate_destinations,
1190  mj_part_t num_parts);
1191 
1198  void create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm);
1199 
1200 
1206  void fill_permutation_array(
1207  mj_part_t output_num_parts,
1208  mj_part_t num_parts);
1209 
1218  void set_final_parts(
1219  mj_part_t current_num_parts,
1220  mj_part_t output_part_begin_index,
1221  RCP<mj_partBoxVector_t> &output_part_boxes,
1222  bool is_data_ever_migrated);
1225  void free_work_memory();
1239  void create_consistent_chunks(
1240  mj_part_t num_parts,
1241  mj_scalar_t *mj_current_dim_coords,
1242  mj_scalar_t *current_concurrent_cut_coordinate,
1243  mj_lno_t coordinate_begin,
1244  mj_lno_t coordinate_end,
1245  mj_scalar_t *used_local_cut_line_weight_to_left,
1246  mj_lno_t *out_part_xadj,
1247  int coordInd, bool longest_dim_part, uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted);
1248 
1253  mj_part_t find_largest_prime_factor(mj_part_t num_parts){
1254  mj_part_t largest_factor = 1;
1255  mj_part_t n = num_parts;
1256  mj_part_t divisor = 2;
1257  while (n > 1){
1258  while (n % divisor == 0){
1259  n = n / divisor;
1260  largest_factor = divisor;
1261  }
1262  ++divisor;
1263  if (divisor * divisor > n){
1264  if (n > 1){
1265  largest_factor = n;
1266  }
1267  break;
1268  }
1269  }
1270  return largest_factor;
1271  }
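  // Worked example (added comment): find_largest_prime_factor(60) strips the
  // factors 2, 2 and 3 and returns 5, while a prime input such as 13 is
  // returned unchanged; this presumably supports the divide_to_prime_first
  // option above.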
1272 public:
1273  AlgMJ();
1274 
1302  void multi_jagged_part(
1303  const RCP<const Environment> &env,
1304  RCP<const Comm<int> > &problemComm,
1305 
1306  double imbalance_tolerance,
1307  size_t num_global_parts,
1308  mj_part_t *part_no_array,
1309  int recursion_depth,
1310 
1311  int coord_dim,
1312  mj_lno_t num_local_coords,
1313  mj_gno_t num_global_coords,
1314  const mj_gno_t *initial_mj_gnos,
1315  mj_scalar_t **mj_coordinates,
1316 
1317  int num_weights_per_coord,
1318  bool *mj_uniform_weights,
1319  mj_scalar_t **mj_weights,
1320  bool *mj_uniform_parts,
1321  mj_scalar_t **mj_part_sizes,
1322 
1323  mj_part_t *&result_assigned_part_ids,
1324  mj_gno_t *&result_mj_gnos);
1325 
1326 
1335  void set_partitioning_parameters(
1336  bool distribute_points_on_cut_lines_,
1337  int max_concurrent_part_calculation_,
1338  int check_migrate_avoid_migration_option_,
1339  double minimum_migration_imbalance_, int migration_type_ = 0);
1340 
1344  void set_to_keep_part_boxes();
1345 
1348  RCP<mj_partBox_t> get_global_box() const;
1349 
1350  RCP<mj_partBoxVector_t> get_kept_boxes() const;
1351 
1352  RCP<mj_partBoxVector_t> compute_global_box_boundaries(
1353  RCP<mj_partBoxVector_t> &localPartBoxes) const;
1354 
1397  void sequential_task_partitioning(
1398  const RCP<const Environment> &env,
1399  mj_lno_t num_total_coords,
1400  mj_lno_t num_selected_coords,
1401  size_t num_target_part,
1402  int coord_dim,
1403  mj_scalar_t **mj_coordinates,
1404  mj_lno_t *initial_selected_coords_output_permutation,
1405  mj_lno_t *output_xadj,
1406  int recursion_depth,
1407  const mj_part_t *part_no_array,
1408  bool partition_along_longest_dim,
1409  int num_ranks_per_node,
1410  bool divide_to_prime_first_,
1411  mj_part_t num_first_level_parts_ = 1,
1412  const mj_part_t *first_level_distribution_ = NULL);
1413 
1414 };
1415 
1458 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1459  typename mj_part_t>
1460 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::sequential_task_partitioning(
1461  const RCP<const Environment> &env,
1462  mj_lno_t num_total_coords,
1463  mj_lno_t num_selected_coords,
1464  size_t num_target_part,
1465  int coord_dim_,
1466  mj_scalar_t **mj_coordinates_,
1467  mj_lno_t *inital_adjList_output_adjlist,
1468  mj_lno_t *output_xadj,
1469  int rd,
1470  const mj_part_t *part_no_array_,
1471  bool partition_along_longest_dim,
1472  int num_ranks_per_node,
1473  bool divide_to_prime_first_,
1474  mj_part_t num_first_level_parts_,
1475  const mj_part_t *first_level_distribution_) {
1476 
1477  this->mj_env = env;
1478  const RCP<Comm<int> > commN;
1479  this->mj_problemComm =
1480  Teuchos::DefaultComm<int>::getDefaultSerialComm(commN);
1481  this->comm =
1482  Teuchos::rcp_const_cast<Comm<int> >(this->mj_problemComm);
1483  this->myActualRank = this->myRank = 1;
1484 
1485 #ifdef HAVE_ZOLTAN2_OMP
1486  //int actual_num_threads = omp_get_num_threads();
1487  //omp_set_num_threads(1);
1488 #endif
1489 
1490  this->divide_to_prime_first = divide_to_prime_first_;
1491  //weights are uniform for task mapping
1492 
1493  //parts are uniform for task mapping
1494  //as input indices.
1495  this->imbalance_tolerance = 0;
1496  this->num_global_parts = num_target_part;
1497  this->part_no_array = (mj_part_t *)part_no_array_;
1498  this->recursion_depth = rd;
1499 
1500  // If nonuniform first level partitioning, the requested num of parts and the requested distribution of
1501  // elements for each part
1502  this->num_first_level_parts = num_first_level_parts_;
1503  this->first_level_distribution = (mj_part_t *)first_level_distribution_;
1504 
1505  this->coord_dim = coord_dim_;
1506  this->num_local_coords = num_total_coords;
1507  this->num_global_coords = num_total_coords;
1508  this->mj_coordinates = mj_coordinates_; //will copy the memory to this->mj_coordinates.
1509 
1512  this->initial_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);
1513 
1514  this->num_weights_per_coord = 0;
1515  bool *tmp_mj_uniform_weights = new bool[1];
1516  this->mj_uniform_weights = tmp_mj_uniform_weights;
1517  this->mj_uniform_weights[0] = true;
1518 
1519  mj_scalar_t **tmp_mj_weights = new mj_scalar_t *[1];
1520  this->mj_weights = tmp_mj_weights; //will copy the memory to this->mj_weights
1521 
1522  bool *tmp_mj_uniform_parts = new bool[1];
1523  this->mj_uniform_parts = tmp_mj_uniform_parts;
1524  this->mj_uniform_parts[0] = true;
1525 
1526  mj_scalar_t **tmp_mj_part_sizes = new mj_scalar_t * [1];
1527  this->mj_part_sizes = tmp_mj_part_sizes;
1528  this->mj_part_sizes[0] = NULL;
1529 
1530  this->num_threads = 1;
1531  this->set_part_specifications();
1532 
1533  this->allocate_set_work_memory();
1534  //the end of the initial partition is the end of coordinates.
1535  this->part_xadj[0] = static_cast<mj_lno_t>(num_selected_coords);
1536  for(size_t i = 0; i < static_cast<size_t>(num_total_coords); ++i){
1537  this->coordinate_permutations[i] = inital_adjList_output_adjlist[i];
1538  }
1539 
1540  mj_part_t current_num_parts = 1;
1541 
1542  mj_scalar_t *current_cut_coordinates = this->all_cut_coordinates;
1543 
1544  mj_part_t future_num_parts = this->total_num_part;
1545 
1546  std::vector<mj_part_t> *future_num_part_in_parts = new std::vector<mj_part_t> ();
1547  std::vector<mj_part_t> *next_future_num_parts_in_parts = new std::vector<mj_part_t> ();
1548  next_future_num_parts_in_parts->push_back(this->num_global_parts);
1549  RCP<mj_partBoxVector_t> t1;
1550  RCP<mj_partBoxVector_t> t2;
1551 
1552 
1553  std::vector <uSignedSortItem<int, mj_scalar_t, char> > coord_dimension_range_sorted(this->coord_dim);
1554  uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted = &(coord_dimension_range_sorted[0]);
1555  std::vector <mj_scalar_t> coord_dim_mins(this->coord_dim);
1556  std::vector <mj_scalar_t> coord_dim_maxs(this->coord_dim);
1557 
1558  for (int i = 0; i < this->recursion_depth; ++i) {
1559 
1560  //partitioning array. size will be as the number of current partitions and this
1561  //holds how many parts that each part will be in the current dimension partitioning.
1562  std::vector <mj_part_t> num_partitioning_in_current_dim;
1563 
1564  //number of parts that will be obtained at the end of this partitioning.
1565  //future_num_part_in_parts is as the size of current number of parts.
1566  //holds how many more parts each should be divided in the further
1567  //iterations. this will be used to calculate num_partitioning_in_current_dim,
1568  //as the number of parts that the part will be partitioned
1569  //in the current dimension partitioning.
1570 
1571  //next_future_num_parts_in_parts will be as the size of outnumParts,
1572  //and this will hold how many more parts that each output part
1573  //should be divided. this array will also be used to determine the weight ratios
1574  //of the parts.
1575  //swap the arrays to use iteratively..
1576  std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
1577  future_num_part_in_parts = next_future_num_parts_in_parts;
1578  next_future_num_parts_in_parts = tmpPartVect;
1579 
1580  //clear next_future_num_parts_in_parts array as
1581  //getPartitionArrays expects it to be empty.
1582  //it also expects num_partitioning_in_current_dim to be empty as well.
1583  next_future_num_parts_in_parts->clear();
1584 
1585 
1586  //returns the total number of output parts for this dimension partitioning.
1587  mj_part_t output_part_count_in_dimension =
1588  this->update_part_num_arrays(
1589  num_partitioning_in_current_dim,
1590  future_num_part_in_parts,
1591  next_future_num_parts_in_parts,
1592  future_num_parts,
1593  current_num_parts,
1594  i,
1595  t1,
1596  t2, num_ranks_per_node);
1597 
1598  //if the number of obtained parts equal to current number of parts,
1599  //skip this dimension. For example, this happens when 1 is given in the input
1600  //part array, e.g., P=4,5,1,2.
1601  if(output_part_count_in_dimension == current_num_parts) {
1602  tmpPartVect= future_num_part_in_parts;
1603  future_num_part_in_parts = next_future_num_parts_in_parts;
1604  next_future_num_parts_in_parts = tmpPartVect;
1605  continue;
1606  }
1607 
1608  //convert i to string to be used for debugging purposes.
1609  std::string istring = Teuchos::toString<int>(i);
1610 
1611  //allocate memory to hold the indices
1612  //of the parts in the permutation array.
1613  this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);
1614 
1615  //the index in outTotalCounts where the next value will be written.
1616  mj_part_t output_part_index = 0;
1617  //whatever is written to outTotalCounts will be added to previousEnd
1618  //so that the points will be shifted.
1619  mj_part_t output_coordinate_end_index = 0;
1620 
1621  mj_part_t current_work_part = 0;
1622  mj_part_t current_concurrent_num_parts = 1;
1623 
1624  mj_part_t obtained_part_index = 0;
1625 
1626  //get the coordinate axis along which the partitioning will be done.
1627  int coordInd = i % this->coord_dim;
1628  mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
1629 
1630 
1631  //run for all available parts.
1632  for (; current_work_part < current_num_parts;
1633  current_work_part += current_concurrent_num_parts) {
1634 
1635 
1636  //current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
1637  //this->max_concurrent_part_calculation);
1638 
1639  mj_part_t actual_work_part_count = 0;
1640  //initialization for 1D partitioning.
1641  //get the min and max coordinates of each part
1642  //together with the part weights of each part.
1643  for (int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1644  mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;
1645 
1646  //if this part won't be partitioned any further,
1647  //don't do any work for this part.
1648  if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
1649  continue;
1650  }
1651  ++actual_work_part_count;
1652  mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
1653  mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts == 0 ?
1654  0 : this->part_xadj[current_work_part_in_concurrent_parts - 1];
1655 
1656 /*
1657  std::cout << "\n\ni:" << i << " j:" << current_work_part + kk
1658  << " coordinate_begin_index:" << coordinate_begin_index
1659  << " coordinate_end_index:" << coordinate_end_index
1660  << " total:" << coordinate_end_index - coordinate_begin_index << "\n\n";
1661 */
1662 
1663 
1664  if (partition_along_longest_dim) {
1665 
1666  mj_scalar_t best_weight_coord = 0;
1667  for (int coord_traverse_ind = 0; coord_traverse_ind < this->coord_dim; ++coord_traverse_ind){
1668  mj_scalar_t best_min_coord = 0;
1669  mj_scalar_t best_max_coord = 0;
1670  //MD:same for all coordinates, but I will still use this for now.
1671 
1672  this->mj_get_local_min_max_coord_totW(
1673  coordinate_begin_index,
1674  coordinate_end_index,
1675  this->coordinate_permutations,
1676  this->mj_coordinates[coord_traverse_ind],
1677  best_min_coord, //min coordinate
1678  best_max_coord, //max coordinate
1679  best_weight_coord //total weight);
1680  );
1681 
1682  coord_dim_mins[coord_traverse_ind] = best_min_coord;
1683  coord_dim_maxs[coord_traverse_ind] = best_max_coord;
1684  mj_scalar_t best_range = best_max_coord - best_min_coord;
1685  coord_dimension_range_sorted[coord_traverse_ind].id = coord_traverse_ind;
1686  coord_dimension_range_sorted[coord_traverse_ind].val = best_range;
1687  coord_dimension_range_sorted[coord_traverse_ind].signbit = 1;
1688  }
1689 
1690 
1691  uqSignsort(this->coord_dim, p_coord_dimension_range_sorted);
1692  coordInd = p_coord_dimension_range_sorted[this->coord_dim - 1].id;
1693 
1694 /*
1695  std::cout << "\n\n";
1696  for (int coord_traverse_ind = 0; coord_traverse_ind < this->coord_dim; ++coord_traverse_ind){
1697  std::cout << "i:" << p_coord_dimension_range_sorted[coord_traverse_ind].id
1698  << " range:" << p_coord_dimension_range_sorted[coord_traverse_ind].val << std::endl;
1699  std::cout << "i:" << p_coord_dimension_range_sorted[coord_traverse_ind].id
1700  << " coord_dim_mins:" << coord_dim_mins[p_coord_dimension_range_sorted[coord_traverse_ind].id]<< std::endl;
1701  std::cout << "i:" << p_coord_dimension_range_sorted[coord_traverse_ind].id
1702  << " coord_dim_maxs:" << coord_dim_maxs[p_coord_dimension_range_sorted[coord_traverse_ind].id] << std::endl;
1703  }
1704  std::cout << "\n\n";
1705 */
1706 
1707  mj_current_dim_coords = this->mj_coordinates[coordInd];
1708 
1709  this->process_local_min_max_coord_total_weight[kk] = coord_dim_mins[coordInd];
1710  this->process_local_min_max_coord_total_weight[kk+ current_concurrent_num_parts] = coord_dim_maxs[coordInd];
1711  this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts] = best_weight_coord;
1712 
1713  }
1714  else{
1715  this->mj_get_local_min_max_coord_totW(
1716  coordinate_begin_index,
1717  coordinate_end_index,
1718  this->coordinate_permutations,
1719  mj_current_dim_coords,
1720  this->process_local_min_max_coord_total_weight[kk], //min coordinate
1721  this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts], //max coordinate
1722  this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts] //total weight);
1723  );
1724  }
1725  }
1726 
1727  //1D partitioning
1728  if (actual_work_part_count > 0) {
1729  //obtain global Min max of the part.
1730  this->mj_get_global_min_max_coord_totW(
1731  current_concurrent_num_parts,
1732  this->process_local_min_max_coord_total_weight,
1733  this->global_min_max_coord_total_weight);
1734 
1735  //represents the total number of cutlines
1736  //whose coordinate should be determined.
1737  mj_part_t total_incomplete_cut_count = 0;
1738 
1739  //Compute weight ratios for parts & cuts:
1740  //e.g., 0.25 0.25 0.5 0.5 0.75 0.75 1.0
1741  // part0 cut0 part1 cut1 part2 cut2 part3
1742  mj_part_t concurrent_part_cut_shift = 0;
1743  mj_part_t concurrent_part_part_shift = 0;
1744 
1745 
1746  for (int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1747  mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
1748  mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
1749  current_concurrent_num_parts];
1750  mj_scalar_t global_total_weight = this->global_min_max_coord_total_weight[kk +
1751  2 * current_concurrent_num_parts];
1752 
1753  mj_part_t concurrent_current_part_index = current_work_part + kk;
1754 
1755  mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];
1756 
1757  mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
1758  mj_scalar_t *current_target_part_weights = this->target_part_weights +
1759  concurrent_part_part_shift;
1760  //shift the usedCutCoordinate array as noCuts.
1761  concurrent_part_cut_shift += partition_count - 1;
1762  //shift the partRatio array as noParts.
1763  concurrent_part_part_shift += partition_count;
1764 
1765  //calculate only if part is not empty,
1766  //and part will be further partitioned.
1767  if(partition_count > 1 && min_coordinate <= max_coordinate){
1768 
1769  //increase allDone by the number of cuts of the current
1770  //part's cut line number.
1771  total_incomplete_cut_count += partition_count - 1;
1772  //set the number of cut lines that should be determined
1773  //for this part.
1774  this->my_incomplete_cut_count[kk] = partition_count - 1;
1775 
1776  // Nonuniform partitioning on the first level, providing
1777  // requested number of parts (num_first_level_parts) and
1778  // requested distribution in parts (first_level_distribution)
1779  if (i == 0 &&
1780  first_level_distribution != NULL &&
1781  num_first_level_parts > 1) {
1782  // Get the target part weights given a desired distribution
1783  this->mj_get_initial_cut_coords_target_weights(
1784  min_coordinate,
1785  max_coordinate,
1786  partition_count - 1,
1787  global_total_weight,
1788  usedCutCoordinate,
1789  current_target_part_weights,
1790  future_num_part_in_parts,
1791  next_future_num_parts_in_parts,
1792  concurrent_current_part_index,
1793  obtained_part_index,
1794  this->num_first_level_parts,
1795  this->first_level_distribution);
1796  }
1797  // Uniform partitioning
1798  else {
1799 
1800  //get the target weights of the parts.
1801  this->mj_get_initial_cut_coords_target_weights(
1802  min_coordinate,
1803  max_coordinate,
1804  partition_count - 1,
1805  global_total_weight,
1806  usedCutCoordinate,
1807  current_target_part_weights,
1808  future_num_part_in_parts,
1809  next_future_num_parts_in_parts,
1810  concurrent_current_part_index,
1811  obtained_part_index);
1812  }
1813 
1814  mj_lno_t coordinate_end_index = this->part_xadj[concurrent_current_part_index];
1815  mj_lno_t coordinate_begin_index = concurrent_current_part_index == 0 ?
1816  0 : this->part_xadj[concurrent_current_part_index - 1];
1817 
1818  //get the initial estimated part assignments of the coordinates.
1819  this->set_initial_coordinate_parts(
1820  max_coordinate,
1821  min_coordinate,
1822  concurrent_current_part_index,
1823  coordinate_begin_index, coordinate_end_index,
1824  this->coordinate_permutations,
1825  mj_current_dim_coords,
1826  this->assigned_part_ids,
1827  partition_count);
1828 
1829  }
1830  else {
1831  // e.g., if we have fewer coordinates than parts, we don't need to do the next dim.
1832  this->my_incomplete_cut_count[kk] = 0;
1833  }
1834  obtained_part_index += partition_count;
1835  }
1836 
1837  //used imbalance, it is always 0, as it is difficult to estimate a range.
1838  double used_imbalance = 0;
1839 
1840 
1841  // Determine cut lines for k parts here.
1842  this->mj_1D_part(
1843  mj_current_dim_coords,
1844  used_imbalance,
1845  current_work_part,
1846  current_concurrent_num_parts,
1847  current_cut_coordinates,
1848  total_incomplete_cut_count,
1849  num_partitioning_in_current_dim);
1850  }
1851  else {
1852  obtained_part_index += current_concurrent_num_parts;
1853  }
1854 
1855  //create part chunks
1856  {
1857 
1858  mj_part_t output_array_shift = 0;
1859  mj_part_t cut_shift = 0;
1860  size_t tlr_shift = 0;
1861  size_t partweight_array_shift = 0;
1862 
1863  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
1864  mj_part_t current_concurrent_work_part = current_work_part + kk;
1865  mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];
1866 
1867  //if the part is empty, skip the part.
1868  if((num_parts != 1 ) && this->global_min_max_coord_total_weight[kk] >
1869  this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {
1870 
1871  for(mj_part_t jj = 0; jj < num_parts; ++jj){
1872  this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
1873  }
1874  cut_shift += num_parts - 1;
1875  tlr_shift += (4 *(num_parts - 1) + 1);
1876  output_array_shift += num_parts;
1877  partweight_array_shift += (2 * (num_parts - 1) + 1);
1878  continue;
1879  }
1880 
1881  mj_lno_t coordinate_end = this->part_xadj[current_concurrent_work_part];
1882  mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[current_concurrent_work_part
1883  -1];
1884  mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
1885  mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
1886  cut_shift;
1887 
1888  for(int ii = 0; ii < this->num_threads; ++ii){
1889  this->thread_part_weight_work[ii] = this->thread_part_weights[ii] + partweight_array_shift;
1890  }
1891 
1892  if(num_parts > 1){
1893  // Rewrite the indices based on the computed cuts.
1894  this->create_consistent_chunks(
1895  num_parts,
1896  mj_current_dim_coords,
1897  current_concurrent_cut_coordinate,
1898  coordinate_begin,
1899  coordinate_end,
1900  used_local_cut_line_weight_to_left,
1901  this->new_part_xadj + output_part_index + output_array_shift,
1902  coordInd,
1903  partition_along_longest_dim,
1904  p_coord_dimension_range_sorted);
1905  }
1906  else {
1907  //if this part is partitioned into 1 then just copy
1908  //the old values.
1909  mj_lno_t part_size = coordinate_end - coordinate_begin;
1910  *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
1911  memcpy(this->new_coordinate_permutations + coordinate_begin,
1912  this->coordinate_permutations + coordinate_begin,
1913  part_size * sizeof(mj_lno_t));
1914  }
1915 
1916 
1917 
1918  cut_shift += num_parts - 1;
1919  tlr_shift += (4 *(num_parts - 1) + 1);
1920  output_array_shift += num_parts;
1921  partweight_array_shift += (2 * (num_parts - 1) + 1);
1922  }
1923 
1924  //shift cut coordinates so that all cut coordinates are stored.
1925  //current_cut_coordinates += cutShift;
1926 
1927  //getChunks from coordinates partitioned the parts and
1928  //wrote the indices as if there were a single part.
1929  //now we need to shift the beginning indices.
1930  for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
1931  mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
1932  for (mj_part_t ii = 0;ii < num_parts ; ++ii){
1933  //shift it by previousCount
1934  this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
1935  if (ii % 2 == 1){
1936  mj_lno_t coordinate_end = this->new_part_xadj[output_part_index+ii];
1937  mj_lno_t coordinate_begin = this->new_part_xadj[output_part_index];
1938 
1939  for (mj_lno_t task_traverse = coordinate_begin; task_traverse < coordinate_end; ++task_traverse){
1940  mj_lno_t l = this->new_coordinate_permutations[task_traverse];
1941  //MARKER: FLIPPED ZORDER BELOW
1942  mj_current_dim_coords[l] = -mj_current_dim_coords[l];
1943  }
1944  }
1945  }
1946  //increase the previous count by current end.
1947  output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
1948  //increase the current out.
1949  output_part_index += num_parts ;
1950  }
1951  }
1952  }
1953  // end of this partitioning dimension
1954 
1955  //set the current num parts for next dim partitioning
1956  current_num_parts = output_part_count_in_dimension;
1957 
1958  //swap the coordinate permutations for the next dimension.
1959  mj_lno_t * tmp = this->coordinate_permutations;
1960  this->coordinate_permutations = this->new_coordinate_permutations;
1961  this->new_coordinate_permutations = tmp;
1962 
1963  freeArray<mj_lno_t>(this->part_xadj);
1964  this->part_xadj = this->new_part_xadj;
1965  this->new_part_xadj = NULL;
1966  }
1967 
1968  for(mj_lno_t i = 0; i < num_total_coords; ++i){
1969  inital_adjList_output_adjlist[i] = this->coordinate_permutations[i];
1970  }
1971 
1972  // Return output_xadj in CSR format
1973  output_xadj[0] = 0;
1974  for(size_t i = 0; i < this->num_global_parts ; ++i){
1975  output_xadj[i+1] = this->part_xadj[i];
1976  }
1977 
1978  delete future_num_part_in_parts;
1979  delete next_future_num_parts_in_parts;
1980 
1981  //free the extra memory that we allocated.
1982  freeArray<mj_part_t>(this->assigned_part_ids);
1983  freeArray<mj_gno_t>(this->initial_mj_gnos);
1984  freeArray<mj_gno_t>(this->current_mj_gnos);
1985  freeArray<bool>(tmp_mj_uniform_weights);
1986  freeArray<bool>(tmp_mj_uniform_parts);
1987  freeArray<mj_scalar_t *>(tmp_mj_weights);
1988  freeArray<mj_scalar_t *>(tmp_mj_part_sizes);
1989 
1990  this->free_work_memory();
1991 
1992 #ifdef HAVE_ZOLTAN2_OMP
1993  //omp_set_num_threads(actual_num_threads);
1994 #endif
1995 }
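// Illustrative call sketch (added; not from the original source): one way the
// sequential_task_partitioning entry point above might be driven for a small
// 2D point set. The template type choices, sizes, and the externally supplied
// Environment are assumptions; the block is fenced with #if 0.
#if 0
void example_sequential_task_partitioning(const RCP<const Environment> &env)
{
  typedef double scalar_t;
  typedef int    lno_t;
  typedef long   gno_t;
  typedef int    part_t;

  const lno_t  n      = 8;  // number of coordinates
  const int    dim    = 2;  // coordinate dimension
  const size_t nparts = 4;  // requested number of parts

  scalar_t x[n], y[n];
  scalar_t *coords[dim] = {x, y};   // one coordinate array per dimension
  lno_t permutation[n];             // in: selected ids, out: ids grouped by part
  for (lno_t i = 0; i < n; ++i) { x[i] = i; y[i] = n - i; permutation[i] = i; }
  lno_t xadj[nparts + 1];           // CSR offsets of the resulting parts

  AlgMJ<scalar_t, lno_t, gno_t, part_t> mj;
  mj.sequential_task_partitioning(
      env, n, n, nparts, dim, coords,
      permutation, xadj,
      /*recursion_depth*/ 2, /*part_no_array*/ NULL,
      /*partition_along_longest_dim*/ true,
      /*num_ranks_per_node*/ 1,
      /*divide_to_prime_first*/ false);
}
#endif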
1996 
2000 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2001  typename mj_part_t>
2002 AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::AlgMJ():
2003  mj_env(), mj_problemComm(), imbalance_tolerance(0),
2004  part_no_array(NULL), recursion_depth(0), coord_dim(0),
2005  num_weights_per_coord(0), initial_num_loc_coords(0),
2006  initial_num_glob_coords(0),
2007  num_local_coords(0), num_global_coords(0), mj_coordinates(NULL),
2008  mj_weights(NULL), mj_uniform_parts(NULL), mj_part_sizes(NULL),
2009  mj_uniform_weights(NULL), mj_gnos(), num_global_parts(1),
2010  initial_mj_gnos(NULL), current_mj_gnos(NULL), owner_of_coordinate(NULL),
2011  coordinate_permutations(NULL), new_coordinate_permutations(NULL),
2012  assigned_part_ids(NULL), part_xadj(NULL), new_part_xadj(NULL),
2013  distribute_points_on_cut_lines(true), max_concurrent_part_calculation(1),
2014  mj_run_as_rcb(false), mj_user_recursion_depth(0), mj_keep_part_boxes(false),
2015  check_migrate_avoid_migration_option(0), migration_type(0), minimum_migration_imbalance(0.30),
2016  num_threads(1), num_first_level_parts(1), first_level_distribution(NULL),
2017  total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
2018  max_num_cut_along_dim(0), max_num_total_part_along_dim(0), total_dim_num_reduce_all(0),
2019  last_dim_num_part(0), comm(), fEpsilon(0), sEpsilon(0), maxScalar_t(0), minScalar_t(0),
2020  all_cut_coordinates(NULL), max_min_coords(NULL), process_cut_line_weight_to_put_left(NULL),
2021  thread_cut_line_weight_to_put_left(NULL), cut_coordinates_work_array(NULL),
2022  target_part_weights(NULL), cut_upper_bound_coordinates(NULL), cut_lower_bound_coordinates(NULL),
2023  cut_lower_bound_weights(NULL), cut_upper_bound_weights(NULL),
2024  process_local_min_max_coord_total_weight(NULL), global_min_max_coord_total_weight(NULL),
2025  is_cut_line_determined(NULL), my_incomplete_cut_count(NULL),
2026  thread_part_weights(NULL), thread_part_weight_work(NULL),
2027  thread_cut_left_closest_point(NULL), thread_cut_right_closest_point(NULL),
2028  thread_point_counts(NULL), process_rectilinear_cut_weight(NULL),
2029  global_rectilinear_cut_weight(NULL),total_part_weight_left_right_closests(NULL),
2030  global_total_part_weight_left_right_closests(NULL),
2031  kept_boxes(),global_box(),
2032  myRank(0), myActualRank(0), divide_to_prime_first(false)
2033 {
2034  this->fEpsilon = std::numeric_limits<float>::epsilon();
2035  this->sEpsilon = std::numeric_limits<mj_scalar_t>::epsilon() * 100;
2036 
2037  this->maxScalar_t = std::numeric_limits<mj_scalar_t>::max();
2038  this->minScalar_t = -std::numeric_limits<mj_scalar_t>::max();
2039 
2040 }
2041 
2042 
2046 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2047  typename mj_part_t>
2048 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBox_t>
2049 AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::get_global_box() const
2050 {
2051  return this->global_box;
2052 }
2053 
2057 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2058  typename mj_part_t>
2059 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_to_keep_part_boxes(){
2060  this->mj_keep_part_boxes = true;
2061 }
2062 
2063 
2064 /* \brief Either the part_no_array or num_global_parts should be provided as
2065  * input. part_no_array takes
2066  * precedence if both are provided.
2067  * Depending on these parameters, the total cut/part counts,
2068  * the maximum part/cut count along a dimension, the estimated number of reduceAlls,
2069  * and the number of parts before the last dimension are calculated.
2070  * */
2071 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2072  typename mj_part_t>
2073 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_part_specifications(){
2074 
2075  this->total_num_cut = 0; //total number of cuts
2076  this->total_num_part = 1; //total number of parts
2077  this->max_num_part_along_dim = 0; //maximum part count along a dimension.
2078  this->total_dim_num_reduce_all = 0; //estimate of how many reduceAlls will be done.
2079  this->last_dim_num_part = 1; //max number of parts that might occur
2080  //during the partitioning before the
2081  //last partitioning dimension.
2082  this->max_num_cut_along_dim = 0;
2083  this->max_num_total_part_along_dim = 0;
2084 
2085  if (this->part_no_array) {
2086  //if user provided part array, traverse the array and set variables.
2087  for (int i = 0; i < this->recursion_depth; ++i){
2088  this->total_dim_num_reduce_all += this->total_num_part;
2089  this->total_num_part *= this->part_no_array[i];
2090  if(this->part_no_array[i] > this->max_num_part_along_dim) {
2091  this->max_num_part_along_dim = this->part_no_array[i];
2092  }
2093  }
2094  this->last_dim_num_part = this->total_num_part / this->part_no_array[recursion_depth-1];
2095  this->num_global_parts = this->total_num_part;
2096  }
2097  else {
2098  mj_part_t future_num_parts = this->num_global_parts;
2099 
2100  // If using nonuniform first level partitioning,
2101  // the initial value of max_num_part_along_dim is num_first_level_parts.
2102  if (this->first_level_distribution != NULL &&
2103  this->num_first_level_parts > 1) {
2104  this->max_num_part_along_dim = this->num_first_level_parts;
2105  }
2106 
2107  // We need to calculate the part numbers now, to determine the maximum along the dimensions.
2108  for (int rd = 0; rd < this->recursion_depth; ++rd){
2109 
2110  mj_part_t maxNoPartAlongI = 0;
2111  mj_part_t nfutureNumParts = 0;
2112 
2113  // Nonuniform first level partitioning sets part specifications for rd == 0 only,
2114  // given requested num of parts and distribution in parts for the first level.
2115  if (rd == 0 &&
2116  this->first_level_distribution != NULL &&
2117  this->num_first_level_parts > 1) {
2118 
2119  maxNoPartAlongI = this->num_first_level_parts;
2120  this->max_num_part_along_dim = this->num_first_level_parts;
2121 
2122  mj_part_t sum_first_level_dist = 0;
2123  mj_part_t max_part = 0;
2124 
2125  // Cumulative sum of distribution of parts and size of largest part
2126  for (int i = 0; i < this->num_first_level_parts; ++i) {
2127 
2128  sum_first_level_dist += this->first_level_distribution[i];
2129 
2130  if (this->first_level_distribution[i] > max_part)
2131  max_part = this->first_level_distribution[i];
2132  }
2133 
2134  // Total parts in largest nonuniform superpart from first level partitioning
2135  nfutureNumParts = this->num_global_parts * max_part / sum_first_level_dist;
2136 
2137  }
2138  // Standard uniform partitioning this level
2139  else {
2140  maxNoPartAlongI = this->get_part_count(future_num_parts,
2141  1.0f / (this->recursion_depth - rd));
2142 
2143  if (maxNoPartAlongI > this->max_num_part_along_dim)
2144  this->max_num_part_along_dim = maxNoPartAlongI;
2145 
2146 
2147  nfutureNumParts = future_num_parts / maxNoPartAlongI;
2148  if (future_num_parts % maxNoPartAlongI){
2149  ++nfutureNumParts;
2150  }
2151  }
2152 
2153  future_num_parts = nfutureNumParts;
2154  }
2155  this->total_num_part = this->num_global_parts;
2156 
2157  if (this->divide_to_prime_first){
2158  this->total_dim_num_reduce_all = this->num_global_parts * 2;
2159  this->last_dim_num_part = this->num_global_parts;
2160  }
2161  else {
2162  //this branch gives the lower bound.
2163 
2164  //estimate the reduceAll count here;
2165  //we compute an upper bound instead of the exact count.
2166  size_t p = 1;
2167 
2168  for (int i = 0; i < this->recursion_depth; ++i){
2169  this->total_dim_num_reduce_all += p;
2170  p *= this->max_num_part_along_dim;
2171  }
2172 
2173  if (p / this->max_num_part_along_dim > this->num_global_parts){
2174  this->last_dim_num_part = this->num_global_parts;
2175  }
2176  else {
2177  this->last_dim_num_part = p / this->max_num_part_along_dim;
2178  }
2179 
2180  }
2181  }
2182 
2183  this->total_num_cut = this->total_num_part - 1;
2184  this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
2185  this->max_num_total_part_along_dim = this->max_num_part_along_dim + size_t(this->max_num_cut_along_dim);
2186  //maxPartNo is P, maxCutNo = P-1, maxTotalPartCount = 2P-1
2187 
2188  //refine the concurrent part count if it is given larger than the maximum possible part count.
2189  if(this->max_concurrent_part_calculation > this->last_dim_num_part){
2190  if(this->mj_problemComm->getRank() == 0){
2191  std::cerr << "Warning: Concurrent part count ("<< this->max_concurrent_part_calculation <<
2192  ") has been set larger than the maximum that can be used." <<
2193  " Setting it to: " << this->last_dim_num_part << "." << std::endl;
2194  }
2195  this->max_concurrent_part_calculation = this->last_dim_num_part;
2196  }
2197 
2198 }
2199 /* \brief Tries to determine the part count for the current dimension,
2200  * by trying to make the partitioning as square as possible.
2201  * \param num_total_future how many more parts are still required.
2202  * \param root the exponent, 1/(remaining recursion depth), used to take the root of num_total_future.
2203  */
2204 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2205  typename mj_part_t>
2206 inline mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::get_part_count(
2207  mj_part_t num_total_future,
2208  double root)
2209 {
2210  double fp = pow(num_total_future, root);
2211  mj_part_t ip = mj_part_t (fp);
2212  if (fp - ip < this->fEpsilon * 100){
2213  return ip;
2214  }
2215  else {
2216  return ip + 1;
2217  }
2218 }
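// Worked example for the rounding above (illustrative only):
//   get_part_count(8, 1.0/3): pow(8, 1/3) ~= 2.0, so 2 is returned
//     (whether pow yields 1.999999... or 2.000001, the epsilon test and the
//      +1 fallback both land on 2).
//   get_part_count(10, 1.0/2): pow(10, 1/2) ~= 3.16; the fractional part is
//     larger than the tolerance, so 4 is returned (rounding up so that enough
//     parts are produced along this dimension).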
2219 
2220 /* \brief Returns how many parts will be obtained after partitioning along this dimension.
2221  * It writes how many parts each current part will be partitioned into in this dimension to the num_partitioning_in_current_dim vector,
2222  * and writes how many total future parts each obtained part will be partitioned into to the next_future_num_parts_in_parts vector.
2223  * If part boxes are kept, it initializes each output_part_boxes entry from its ancestor.
2224  *
2225  * \param num_partitioning_in_current_dim: output. How many parts each current part will be partitioned into.
2226  * \param future_num_part_in_parts: input, how many future parts each current part will be partitioned into.
2227  * \param next_future_num_parts_in_parts: output, how many future parts each obtained part will be partitioned into.
2228  * \param future_num_parts: input/output, max number of future parts that will be obtained from a single part.
2229  * \param current_num_parts: input, how many parts there are currently.
2230  * \param current_iteration: input, current dimension iteration number.
2231  * \param input_part_boxes: input, if boxes are kept, the current boxes.
2232  * \param output_part_boxes: output, if boxes are kept, the initial box boundaries for the obtained parts.
2233  */
2234 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2235  typename mj_part_t>
2236 mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::update_part_num_arrays(
2237  std::vector <mj_part_t> &num_partitioning_in_current_dim, //assumes this vector is empty.
2238  std::vector<mj_part_t> *future_num_part_in_parts,
2239  std::vector<mj_part_t> *next_future_num_parts_in_parts, //assumes this vector is empty.
2240  mj_part_t &future_num_parts,
2241  mj_part_t current_num_parts,
2242  int current_iteration,
2243  RCP<mj_partBoxVector_t> input_part_boxes,
2244  RCP<mj_partBoxVector_t> output_part_boxes,
2245  mj_part_t atomic_part_count) {
2246 
2247  //how many parts will be obtained after this dimension.
2248  mj_part_t output_num_parts = 0;
2249 
2250  if(this->part_no_array){
2251  //when the partNo array is provided as input,
2252  //each current partition will be partitioned into the same number of parts.
2253  //we don't need to use the future_num_part_in_parts vector in this case.
2254 
2255  mj_part_t p = this->part_no_array[current_iteration];
2256  if (p < 1){
2257  std::cout << "Current recursive iteration: " << current_iteration
2258  << " part_no_array[" << current_iteration << "] is given as:" << p << std::endl;
2259  exit(1);
2260  }
2261  if (p == 1){
2262  return current_num_parts;
2263  }
2264  // If using part_no_array, ensure compatibility with num_first_level_parts.
2265  if (this->first_level_distribution != NULL &&
2266  current_iteration == 0 &&
2267  p != this->num_first_level_parts)
2268  {
2269  std::cout << "Current recursive iteration: " << current_iteration
2270  << " part_no_array[" << current_iteration << "] is given as: " << p
2271  << " and contradicts num_first_level_parts: " << this->num_first_level_parts << std::endl;
2272  exit(1);
2273  }
2274 
2275  for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
2276  num_partitioning_in_current_dim.push_back(p);
2277  }
2278 
2279 /*
2280  std::cout << "\n\nme: " << this->myRank << " current_iteration: " << current_iteration
2281  << " current_num_parts: " << current_num_parts << "\n\n";
2282 
2283  std::cout << "\n\nnum_partitioning_in_current_dim[0]: " << num_partitioning_in_current_dim[0] << "\n\n";
2284 
2285  //set the new value of future_num_parts.
2286 
2287  std::cout << "\n\nfuture_num_parts: " << future_num_parts
2288  << " num_partitioning_in_current_dim[0]: " << num_partitioning_in_current_dim[0]
2289  << " " << future_num_parts / num_partitioning_in_current_dim[0] << "\n\n";
2290 */
2291 
2292  future_num_parts /= num_partitioning_in_current_dim[0];
2293  output_num_parts = current_num_parts * num_partitioning_in_current_dim[0];
2294 
2295  if (this->mj_keep_part_boxes){
2296  for (mj_part_t k = 0; k < current_num_parts; ++k){
2297  //initialize the output boxes as their ancestor.
2298  for (mj_part_t j = 0; j < num_partitioning_in_current_dim[0]; ++j){
2299  output_part_boxes->push_back((*input_part_boxes)[k]);
2300  }
2301  }
2302  }
2303 
2304  //set how many more parts each part will be divided into.
2305  //this is obvious when the partNo array is provided as input;
2306  //however, fill this so that weights will be calculated according to this array.
2307  for (mj_part_t ii = 0; ii < output_num_parts; ++ii){
2308  next_future_num_parts_in_parts->push_back(future_num_parts);
2309  }
2310  }
2311  else {
2312  //if the partNo array is not provided as input,
2313  //future_num_part_in_parts holds how many parts each part should be divided into.
2314  //initially it holds a single number equal to the total number of global parts.
2315 
2316  //calculate the future_num_parts from beginning,
2317  //since each part might be divided into different number of parts.
2318  future_num_parts = 1;
2319 
2320  //std::cout << "i:" << i << std::endl;
2321 
2322  for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
2323  //get how many parts a part should be divided into.
2324  mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];
2325 
2326  //get the ideal number of parts that is close to the
2327  //(recursion_depth - current_iteration)-th root of future_num_parts_of_part_ii.
2328  mj_part_t num_partitions_in_current_dim =
2329  this->get_part_count(future_num_parts_of_part_ii,
2330  1.0 / (this->recursion_depth - current_iteration) );
2331 
2332  if (num_partitions_in_current_dim > this->max_num_part_along_dim){
2333  std::cerr << "ERROR: maxPartNo calculation is wrong. num_partitions_in_current_dim: "
2334  << num_partitions_in_current_dim << " this->max_num_part_along_dim: "
2335  << this->max_num_part_along_dim <<
2336  " this->recursion_depth: " << this->recursion_depth <<
2337  " current_iteration: " << current_iteration <<
2338  " future_num_parts_of_part_ii: " << future_num_parts_of_part_ii <<
2339  " might need to fix max part no calculation for largest_prime_first partitioning." <<
2340  std::endl;
2341  exit(1);
2342  }
2343  //add this number to num_partitioning_in_current_dim vector.
2344 // num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2345 
2346 // mj_part_t largest_prime_factor = num_partitions_in_current_dim;
2347 
2348  // Update part num arrays when on current_iteration == 0 and
2349  // using nonuniform first level partitioning
2350  // with requested num parts (num_first_level_parts) and
2351  // a requested distribution in parts (first_level_distribution).
2352  if (current_iteration == 0 &&
2353  this->first_level_distribution != NULL &&
2354  this->num_first_level_parts > 1) {
2355 
2356  // There is only 1 current part to begin with, and it is partitioned into
2357  // num_first_level_parts parts
2358  num_partitioning_in_current_dim.push_back(this->num_first_level_parts);
2359 
2360  // The output number of parts from first level partitioning
2361  output_num_parts = this->num_first_level_parts;
2362 
2363  // Remaining parts left to partition for all future levels
2364  future_num_parts /= this->num_first_level_parts;
2365 
2366  mj_part_t max_part = 0;
2367  mj_part_t sum_first_level_dist = 0;
2368 
2369  // Cumulative sum of distribution of first level parts
2370  // and size of largest first level part
2371  for (int i = 0; i < this->num_first_level_parts; ++i) {
2372  sum_first_level_dist += this->first_level_distribution[i];
2373 
2374  if (this->first_level_distribution[i] > max_part)
2375  max_part = this->first_level_distribution[i];
2376  }
2377 
2378  // Maximum # of remaining parts left to partition for all future levels
2379  future_num_parts = this->num_global_parts * max_part / sum_first_level_dist;
2380 
2381  // Number of parts remaining left to partition for each future_part
2382  // The sum must exactly equal global_num_parts
2383  for (int i = 0; i < this->num_first_level_parts; ++i) {
2384 
2385  next_future_num_parts_in_parts->push_back(this->first_level_distribution[i] *
2386  this->num_global_parts / sum_first_level_dist);
2387  }
2388  }
2389  else if (this->divide_to_prime_first) {
2390 
2391  // Add this number to num_partitioning_in_current_dim vector.
2392  num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2393 
2394  mj_part_t largest_prime_factor = num_partitions_in_current_dim;
2395 
2396  //increase the output number of parts.
2397  output_num_parts += num_partitions_in_current_dim;
2398 
2399  if (future_num_parts_of_part_ii == atomic_part_count ||
2400  future_num_parts_of_part_ii % atomic_part_count != 0) {
2401  atomic_part_count = 1;
2402  }
2403 
2404  largest_prime_factor =
2405  this->find_largest_prime_factor(future_num_parts_of_part_ii / atomic_part_count);
2406 
2407  // We divide into num_partitions_in_current_dim parts, but we adjust the weights
2408  // based on the largest prime. For example, if num_partitions_in_current_dim = 2 and the
2409  // largest prime = 5, we divide into 2 parts with weights 3x and 2x.
2410  // If the largest prime is less than the part count, we use the part count
2411  // so that we divide uniformly.
2412  if (largest_prime_factor < num_partitions_in_current_dim) {
2413  largest_prime_factor = num_partitions_in_current_dim;
2414  }
2415 
2416  //ideal number of future partitions for each part.
2417  mj_part_t ideal_num_future_parts_in_part =
2418  (future_num_parts_of_part_ii / atomic_part_count) / largest_prime_factor;
2419  //if num_partitions_in_current_dim = 2, largest prime = 5 then ideal weight is 2x
2420  mj_part_t ideal_prime_scale = largest_prime_factor / num_partitions_in_current_dim;
2421 
2422 /*
2423  std::cout << "\ncurrent num part: " << ii
2424  << " largest_prime_factor: " << largest_prime_factor
2425  << " To Partition: " << future_num_parts_of_part_ii << "\n\n";
2426 */
2427 
2428  for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2429  //if num_partitions_in_current_dim = 2, largest prime = 5 then ideal weight is 2x
2430  mj_part_t my_ideal_primescale = ideal_prime_scale;
2431  //left over weights. The left side is adjusted to be 3x, the right side stays as 2x
2432  if (iii < (largest_prime_factor) % num_partitions_in_current_dim) {
2433  ++my_ideal_primescale;
2434  }
2435  //scale with 'x';
2436  mj_part_t num_future_parts_for_part_iii =
2437  ideal_num_future_parts_in_part * my_ideal_primescale;
2438 
2439  //if there is a remainder in the part increase the part weight.
2440  if (iii < (future_num_parts_of_part_ii / atomic_part_count) % largest_prime_factor) {
2441  //if not uniform, add 1 for the extra parts.
2442  ++num_future_parts_for_part_iii;
2443  }
2444 
2445  next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);
2446 
2447  //if part boxes are stored, initialize the box of the parts as the ancestor.
2448  if (this->mj_keep_part_boxes) {
2449  output_part_boxes->push_back((*input_part_boxes)[ii]);
2450  }
2451 
2452  //set future_num_parts to the maximum over this part.
2453  if (num_future_parts_for_part_iii > future_num_parts)
2454  future_num_parts = num_future_parts_for_part_iii;
2455 
2456  }
2457  }
2458  else {
2459 
2460  // Add this number to num_partitioning_in_current_dim vector.
2461  num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2462 
2463  //increase the output number of parts.
2464  output_num_parts += num_partitions_in_current_dim;
2465 
2466  if (future_num_parts_of_part_ii == atomic_part_count ||
2467  future_num_parts_of_part_ii % atomic_part_count != 0) {
2468  atomic_part_count = 1;
2469  }
2470  //ideal number of future partitions for each part.
2471  mj_part_t ideal_num_future_parts_in_part =
2472  (future_num_parts_of_part_ii / atomic_part_count) / num_partitions_in_current_dim;
2473 
2474  for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii){
2475  mj_part_t num_future_parts_for_part_iii = ideal_num_future_parts_in_part;
2476 
2477  //if there is a remainder in the part increase the part weight.
2478  if (iii < (future_num_parts_of_part_ii / atomic_part_count) % num_partitions_in_current_dim){
2479  //if not uniform, add 1 for the extra parts.
2480  ++num_future_parts_for_part_iii;
2481  }
2482 
2483  next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);
2484 
2485  //if part boxes are stored, initialize the box of the parts as the ancestor.
2486  if (this->mj_keep_part_boxes){
2487  output_part_boxes->push_back((*input_part_boxes)[ii]);
2488  }
2489 
2490  //set future_num_parts to the maximum over this part.
2491  if (num_future_parts_for_part_iii > future_num_parts)
2492  future_num_parts = num_future_parts_for_part_iii;
2493  }
2494  }
2495  }
2496  }
2497  return output_num_parts;
2498 }
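// Worked example of the uniform branch above (a sketch; assumes
// part_no_array == NULL, divide_to_prime_first == false, atomic_part_count == 1,
// first_level_distribution == NULL): with current_num_parts = 2,
// future_num_part_in_parts = {8, 8}, recursion_depth = 3 and
// current_iteration = 0, get_part_count(8, 1/3) gives 2 for each part, so
// num_partitioning_in_current_dim becomes {2, 2} and the function returns
// output_num_parts = 4. Each obtained part still has 8/2 = 4 future parts to
// produce, so next_future_num_parts_in_parts becomes {4, 4, 4, 4} and
// future_num_parts is set to 4.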
2499 
2500 
2501 /* \brief Allocates and initializes the work memory that will be used by MJ.
2502  *
2503  * */
2504 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2505  typename mj_part_t>
2506 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::allocate_set_work_memory(){
2507 
2508  //points to process that initially owns the coordinate.
2509  this->owner_of_coordinate = NULL;
2510 
2511  //Throughout the partitioning execution,
2512  //instead of moving the coordinates, hold a permutation array for parts.
2513  //coordinate_permutations holds the current permutation.
2514  this->coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);
2515  //initial configuration, set each pointer-i to i.
2516 #ifdef HAVE_ZOLTAN2_OMP
2517 #pragma omp parallel for
2518 #endif
2519  for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
2520  this->coordinate_permutations[i] = i;
2521  }
2522 
2523  //new_coordinate_permutations holds the permutation being built for the next iteration.
2524  this->new_coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);
2525 
2526  this->assigned_part_ids = NULL;
2527  if(this->num_local_coords > 0){
2528  this->assigned_part_ids = allocMemory<mj_part_t>(this->num_local_coords);
2529  }
2530 
2531  //a single partition starts at index 0 and ends at numLocalCoords.
2532  //the part_xadj array holds the end points in the coordinate_permutations array
2533  //for each partition. Initially sized 1, and the single element is set to numLocalCoords.
2534  this->part_xadj = allocMemory<mj_lno_t>(1);
2535  this->part_xadj[0] = static_cast<mj_lno_t>(this->num_local_coords);//the end of the initial partition is the end of coordinates.
2536  //the end points of the output; this is allocated later.
2537  this->new_part_xadj = NULL;
2538 
2539  // only this much storage would be needed if the cuts had to be stored.
2540  //this->all_cut_coordinates = allocMemory< mj_scalar_t>(this->total_num_cut);
2541 
2542 
2543  this->all_cut_coordinates = allocMemory< mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2544 
2545  this->max_min_coords = allocMemory< mj_scalar_t>(this->num_threads * 2);
2546 
2547  this->process_cut_line_weight_to_put_left = NULL; //what fraction of weight an MPI process should put on the left side of each cut line
2548  this->thread_cut_line_weight_to_put_left = NULL; //what fraction of weight each thread in an MPI process should put on the left side of each cut line
2549  //distribute_points_on_cut_lines = false;
2550  if(this->distribute_points_on_cut_lines){
2551  this->process_cut_line_weight_to_put_left = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2552  this->thread_cut_line_weight_to_put_left = allocMemory<mj_scalar_t *>(this->num_threads);
2553  for(int i = 0; i < this->num_threads; ++i){
2554  this->thread_cut_line_weight_to_put_left[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2555  }
2556  this->process_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2557  this->global_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2558  }
2559 
2560 
2561  // work array to manipulate the coordinates of cut lines in different iterations.
2562  //necessary because the previous cut line information is used for determining
2563  //the next cut line information; therefore, we cannot update the cut work array
2564  //until all cut lines are determined.
2565  this->cut_coordinates_work_array = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim *
2566  this->max_concurrent_part_calculation);
2567 
2568 
2569  //cumulative part weight array.
2570  this->target_part_weights = allocMemory<mj_scalar_t>(
2571  this->max_num_part_along_dim * this->max_concurrent_part_calculation);
2572  // the weight from left to right.
2573 
2574  this->cut_upper_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation); //upper bound coordinate of a cut line
2575  this->cut_lower_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation); //lower bound coordinate of a cut line
2576  this->cut_lower_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation); //lower bound weight of a cut line
2577  this->cut_upper_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation); //upper bound weight of a cut line
2578 
2579  this->process_local_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation); //combined array to exchange the min and max coordinate, and total weight of part.
2580  this->global_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation);//global combined array with the results for min, max and total weight.
2581 
2582  //is_cut_line_determined marks whether a cut line has already been determined.
2583  //If a cut line is already determined, the next iterations will skip this cut line.
2584  this->is_cut_line_determined = allocMemory<bool>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2585  //my_incomplete_cut_count holds the number of cut lines that have not been finalized for each part.
2586  //when concurrentPartCount>1, if my_incomplete_cut_count[x]==0, then no work is done for this part.
2587  this->my_incomplete_cut_count = allocMemory<mj_part_t>(this->max_concurrent_part_calculation);
2588  //local part weights of each thread.
2589  this->thread_part_weights = allocMemory<double *>(this->num_threads);
2590  //the work manipulation array for part weights.
2591  this->thread_part_weight_work = allocMemory<double *>(this->num_threads);
2592 
2593  //thread_cut_left_closest_point to hold the closest coordinate to a cutline from left (for each thread).
2594  this->thread_cut_left_closest_point = allocMemory<mj_scalar_t *>(this->num_threads);
2595  //thread_cut_right_closest_point to hold the closest coordinate to a cutline from right (for each thread)
2596  this->thread_cut_right_closest_point = allocMemory<mj_scalar_t *>(this->num_threads);
2597 
2598  //to store how many points in each part a thread has.
2599  this->thread_point_counts = allocMemory<mj_lno_t *>(this->num_threads);
2600 
2601  for(int i = 0; i < this->num_threads; ++i){
2602  //partWeights[i] = allocMemory<mj_scalar_t>(maxTotalPartCount);
2603  this->thread_part_weights[i] = allocMemory < double >(this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
2604  this->thread_cut_right_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2605  this->thread_cut_left_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2606  this->thread_point_counts[i] = allocMemory<mj_lno_t>(this->max_num_part_along_dim);
2607  }
2608  //for faster communication, concatenation of
2609  //totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
2610  //leftClosest distances sized P-1, since P-1 cut lines
2611  //rightClosest distances size P-1, since P-1 cut lines.
2612  this->total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2613  this->global_total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2614 
2615 
2616  mj_scalar_t **coord = allocMemory<mj_scalar_t *>(this->coord_dim);
2617  for (int i=0; i < this->coord_dim; i++){
2618  coord[i] = allocMemory<mj_scalar_t>(this->num_local_coords);
2619 #ifdef HAVE_ZOLTAN2_OMP
2620 #pragma omp parallel for
2621 #endif
2622  for (mj_lno_t j=0; j < this->num_local_coords; j++)
2623  coord[i][j] = this->mj_coordinates[i][j];
2624  }
2625  this->mj_coordinates = coord;
2626 
2627 
2628  int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
2629  mj_scalar_t **weights = allocMemory<mj_scalar_t *>(criteria_dim);
2630 
2631  for (int i=0; i < criteria_dim; i++){
2632  weights[i] = NULL;
2633  }
2634  for (int i=0; i < this->num_weights_per_coord; i++){
2635  weights[i] = allocMemory<mj_scalar_t>(this->num_local_coords);
2636 #ifdef HAVE_ZOLTAN2_OMP
2637 #pragma omp parallel for
2638 #endif
2639  for (mj_lno_t j=0; j < this->num_local_coords; j++)
2640  weights[i][j] = this->mj_weights[i][j];
2641 
2642  }
2643  this->mj_weights = weights;
2644  this->current_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);
2645 #ifdef HAVE_ZOLTAN2_OMP
2646 #pragma omp parallel for
2647 #endif
2648  for (mj_lno_t j=0; j < this->num_local_coords; j++)
2649  this->current_mj_gnos[j] = this->initial_mj_gnos[j];
2650 
2651  this->owner_of_coordinate = allocMemory<int>(this->num_local_coords);
2652 
2653 #ifdef HAVE_ZOLTAN2_OMP
2654 #pragma omp parallel for
2655 #endif
2656  for (mj_lno_t j=0; j < this->num_local_coords; j++)
2657  this->owner_of_coordinate[j] = this->myActualRank;
2658 }
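// Sizing sketch for the main work arrays above (illustrative; P denotes
// max_num_part_along_dim and C denotes max_concurrent_part_calculation):
//   all_cut_coordinates and the cut bound/weight arrays : (P - 1) * C entries
//   target_part_weights                                 : P * C entries
//   thread_part_weights[t]                              : (2P - 1) * C entries
//   total_part_weight_left_right_closests               : ((2P - 1) + 2(P - 1)) * C entries,
//     i.e. part weights (2P - 1), left closest points (P - 1) and right closest
//     points (P - 1) packed back to back so a single reduceAll per iteration suffices.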
2659 
2660 /* \brief compute the global bounding box
2661  */
2662 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2663  typename mj_part_t>
2664 void AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::compute_global_box()
2665 {
2666  //local min coords
2667  mj_scalar_t *mins = allocMemory<mj_scalar_t>(this->coord_dim);
2668  //global min coords
2669  mj_scalar_t *gmins = allocMemory<mj_scalar_t>(this->coord_dim);
2670  //local max coords
2671  mj_scalar_t *maxs = allocMemory<mj_scalar_t>(this->coord_dim);
2672  //global max coords
2673  mj_scalar_t *gmaxs = allocMemory<mj_scalar_t>(this->coord_dim);
2674 
2675  for (int i = 0; i < this->coord_dim; ++i){
2676  mj_scalar_t localMin = std::numeric_limits<mj_scalar_t>::max();
2677  mj_scalar_t localMax = -localMin;
2678  if (localMax > 0) localMax = 0;
2679 
2680 
2681  for (mj_lno_t j = 0; j < this->num_local_coords; ++j){
2682  if (this->mj_coordinates[i][j] < localMin){
2683  localMin = this->mj_coordinates[i][j];
2684  }
2685  if (this->mj_coordinates[i][j] > localMax){
2686  localMax = this->mj_coordinates[i][j];
2687  }
2688  }
2689  //std::cout << " localMin:" << localMin << std::endl;
2690  //std::cout << " localMax:" << localMax << std::endl;
2691  mins[i] = localMin;
2692  maxs[i] = localMax;
2693 
2694  }
2695  reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
2696  this->coord_dim, mins, gmins
2697  );
2698 
2699 
2700  reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
2701  this->coord_dim, maxs, gmaxs
2702  );
2703 
2704 
2705 
2706  //create single box with all areas.
2707  global_box = rcp(new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
2708  //coordinateModelPartBox <mj_scalar_t, mj_part_t> tmpBox (0, coordDim);
2709  freeArray<mj_scalar_t>(mins);
2710  freeArray<mj_scalar_t>(gmins);
2711  freeArray<mj_scalar_t>(maxs);
2712  freeArray<mj_scalar_t>(gmaxs);
2713 }
2714 
2715 /* \brief for part communication we keep track of the box boundaries.
2716  * This is performed when either asked specifically, or when geometric mapping is performed afterwards.
2717  * This function initializes a single box with all global min and max coordinates.
2718  * \param initial_partitioning_boxes the input and output vector for boxes.
2719  */
2720 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2721  typename mj_part_t>
2722 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::init_part_boxes(
2723  RCP<mj_partBoxVector_t> & initial_partitioning_boxes
2724 )
2725 {
2726  mj_partBox_t tmp_box(*global_box);
2727  initial_partitioning_boxes->push_back(tmp_box);
2728 }
2729 
2740 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2741  typename mj_part_t>
2742 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_local_min_max_coord_totW(
2743  mj_lno_t coordinate_begin_index,
2744  mj_lno_t coordinate_end_index,
2745  mj_lno_t *mj_current_coordinate_permutations,
2746  mj_scalar_t *mj_current_dim_coords,
2747  mj_scalar_t &min_coordinate,
2748  mj_scalar_t &max_coordinate,
2749  mj_scalar_t &total_weight){
2750 
2751  //if the part is empty,
2752  //set the min and max coordinates reversed (min > max).
2753  if(coordinate_begin_index >= coordinate_end_index)
2754  {
2755  min_coordinate = this->maxScalar_t;
2756  max_coordinate = this->minScalar_t;
2757  total_weight = 0;
2758  }
2759  else {
2760  mj_scalar_t my_total_weight = 0;
2761 #ifdef HAVE_ZOLTAN2_OMP
2762 #pragma omp parallel num_threads(this->num_threads)
2763 #endif
2764  {
2765  //if uniform weights are used, then weight is equal to count.
2766  if (this->mj_uniform_weights[0]) {
2767 #ifdef HAVE_ZOLTAN2_OMP
2768 #pragma omp single
2769 #endif
2770  {
2771  my_total_weight = coordinate_end_index - coordinate_begin_index;
2772  }
2773 
2774  }
2775  else {
2776  //if not uniform, then the weights are reduced across threads.
2777 #ifdef HAVE_ZOLTAN2_OMP
2778 #pragma omp for reduction(+:my_total_weight)
2779 #endif
2780  for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2781  int i = mj_current_coordinate_permutations[ii];
2782  my_total_weight += this->mj_weights[0][i];
2783  }
2784  }
2785 
2786  int my_thread_id = 0;
2787 #ifdef HAVE_ZOLTAN2_OMP
2788  my_thread_id = omp_get_thread_num();
2789 #endif
2790  mj_scalar_t my_thread_min_coord, my_thread_max_coord;
2791  my_thread_min_coord=my_thread_max_coord
2792  =mj_current_dim_coords[mj_current_coordinate_permutations[coordinate_begin_index]];
2793 
2794 
2795 #ifdef HAVE_ZOLTAN2_OMP
2796 #pragma omp for
2797 #endif
2798  for(mj_lno_t j = coordinate_begin_index + 1; j < coordinate_end_index; ++j){
2799  int i = mj_current_coordinate_permutations[j];
2800  if(mj_current_dim_coords[i] > my_thread_max_coord)
2801  my_thread_max_coord = mj_current_dim_coords[i];
2802  if(mj_current_dim_coords[i] < my_thread_min_coord)
2803  my_thread_min_coord = mj_current_dim_coords[i];
2804  }
2805  this->max_min_coords[my_thread_id] = my_thread_min_coord;
2806  this->max_min_coords[my_thread_id + this->num_threads] = my_thread_max_coord;
2807 
2808 #ifdef HAVE_ZOLTAN2_OMP
2809 //we need a barrier here, because max_min_coords might not be filled by some of the threads.
2810 #pragma omp barrier
2811 #pragma omp single nowait
2812 #endif
2813  {
2814  min_coordinate = this->max_min_coords[0];
2815  for(int i = 1; i < this->num_threads; ++i){
2816  if(this->max_min_coords[i] < min_coordinate)
2817  min_coordinate = this->max_min_coords[i];
2818  }
2819  }
2820 
2821 #ifdef HAVE_ZOLTAN2_OMP
2822 #pragma omp single nowait
2823 #endif
2824  {
2825  max_coordinate = this->max_min_coords[this->num_threads];
2826  for(int i = this->num_threads + 1; i < this->num_threads * 2; ++i){
2827  if(this->max_min_coords[i] > max_coordinate)
2828  max_coordinate = this->max_min_coords[i];
2829  }
2830  }
2831  }
2832  total_weight = my_total_weight;
2833  }
2834 }
2835 
2836 
2844 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2845  typename mj_part_t>
2846 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_global_min_max_coord_totW(
2847  mj_part_t current_concurrent_num_parts,
2848  mj_scalar_t *local_min_max_total,
2849  mj_scalar_t *global_min_max_total){
2850 
2851  //reduce min for the first current_concurrent_num_parts elements, reduce max for the next
2852  //current_concurrent_num_parts elements,
2853  //and reduce sum for the last current_concurrent_num_parts elements.
2854  if(this->comm->getSize() > 1){
2855  Teuchos::MultiJaggedCombinedMinMaxTotalReductionOp<int, mj_scalar_t>
2856  reductionOp(
2857  current_concurrent_num_parts,
2858  current_concurrent_num_parts,
2859  current_concurrent_num_parts);
2860  try{
2861  reduceAll<int, mj_scalar_t>(
2862  *(this->comm),
2863  reductionOp,
2864  3 * current_concurrent_num_parts,
2865  local_min_max_total,
2866  global_min_max_total);
2867  }
2868  Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
2869  }
2870  else {
2871  mj_part_t s = 3 * current_concurrent_num_parts;
2872  for (mj_part_t i = 0; i < s; ++i){
2873  global_min_max_total[i] = local_min_max_total[i];
2874  }
2875  }
2876 }
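// Layout sketch for the packed reduction above (illustrative): with
// k = current_concurrent_num_parts, local_min_max_total holds
//   [0, k)   : per-part local minimum coordinates  (combined with REDUCE_MIN)
//   [k, 2k)  : per-part local maximum coordinates  (combined with REDUCE_MAX)
//   [2k, 3k) : per-part local total weights        (combined with REDUCE_SUM)
// e.g. for k = 2: { min0, min1, max0, max1, weight0, weight1 }, and
// global_min_max_total comes back in the same order after the single reduceAll.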
2877 
2878 
2879 
2907 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2908  typename mj_part_t>
2909 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_initial_cut_coords_target_weights(
2910  mj_scalar_t min_coord,
2911  mj_scalar_t max_coord,
2912  mj_part_t num_cuts/*p-1*/ ,
2913  mj_scalar_t global_weight,
2914  mj_scalar_t *initial_cut_coords /*p - 1 sized, coordinate of each cut line*/,
2915  mj_scalar_t *current_target_part_weights /*cumulative weights, at left side of each cut line. p-1 sized*/,
2916 
2917  std::vector <mj_part_t> *future_num_part_in_parts, //the vector of future part counts for each current part
2918  std::vector <mj_part_t> *next_future_num_parts_in_parts,
2919  mj_part_t concurrent_current_part,
2920  mj_part_t obtained_part_index,
2921  mj_part_t num_target_first_level_parts,
2922  const mj_part_t *target_first_level_dist) {
2923 
2924  mj_scalar_t coord_range = max_coord - min_coord;
2925 
2926  // Uniform target weights
2927  if (num_target_first_level_parts <= 1 &&
2928  this->mj_uniform_parts[0]) {
2929  {
2930  mj_part_t cumulative = 0;
2931 
2932  // How many total future parts the part will be partitioned into.
2933  mj_scalar_t total_future_part_count_in_part = mj_scalar_t((*future_num_part_in_parts)[concurrent_current_part]);
2934 
2935  // How much each part should weigh in ideal case.
2936  mj_scalar_t unit_part_weight = global_weight / total_future_part_count_in_part;
2937 
2938  for (mj_part_t i = 0; i < num_cuts; ++i) {
2939  cumulative += (*next_future_num_parts_in_parts)[i + obtained_part_index];
2940 
2941  // Set target part weight.
2942  current_target_part_weights[i] = cumulative * unit_part_weight;
2943 
2944  // Set initial cut coordinate.
2945  initial_cut_coords[i] = min_coord + (coord_range * cumulative) / total_future_part_count_in_part;
2946  }
2947 
2948  current_target_part_weights[num_cuts] = global_weight;
2949  }
2950 
2951  // Round the target part weights.
2952  if (this->mj_uniform_weights[0]) { // Repeated if???
2953  for (mj_part_t i = 0; i < num_cuts + 1; ++i) {
2954  current_target_part_weights[i] = long(current_target_part_weights[i] + 0.5);
2955  }
2956  }
2957  }
2958  // Nonuniform target weights for first level of partitioning
2959  else if(num_target_first_level_parts > 1 &&
2960  target_first_level_dist != NULL) {
2961  {
2962  // Running sum of the total weight
2963  mj_part_t cumulative = 0.0;
2964 
2965  // Sum of entries in the first level partition distribution vector
2966  mj_scalar_t sum_target_first_level_dist = 0.0;
2967 
2968  for (int i = 0; i < num_target_first_level_parts; ++i) {
2969  sum_target_first_level_dist += target_first_level_dist[i];
2970  }
2971 
2972  for (mj_part_t i = 0; i < num_cuts; ++i) {
2973  cumulative += global_weight * target_first_level_dist[i] / sum_target_first_level_dist;
2974 
2975  // Set target part weight.
2976  current_target_part_weights[i] = cumulative;
2977 
2978  // Set initial cut coordinate.
2979  initial_cut_coords[i] = min_coord + (coord_range * cumulative) / global_weight;
2980  }
2981 
2982  current_target_part_weights[num_cuts] = global_weight;
2983  }
2984 
2985  //round the target part weights.
2986  for (mj_part_t i = 0; i < num_cuts + 1; ++i) {
2987  current_target_part_weights[i] = long(current_target_part_weights[i] + 0.5);
2988  }
2989  }
2990  else {
2991  std::cerr << "MJ does not support non uniform part weights beyond the first partition" << std::endl;
2992  exit(1);
2993  }
2994 }
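// Worked example of the uniform branch above (a sketch; assumes uniform parts
// and weights): min_coord = 0, max_coord = 100, num_cuts = 3 (4 parts),
// global_weight = 400, the concurrent part has 4 future parts and each
// obtained part has a future count of 1. Then unit_part_weight = 400 / 4 = 100,
// and the loop produces
//   initial_cut_coords          = { 25, 50, 75 }
//   current_target_part_weights = { 100, 200, 300, 400 }
// i.e. evenly spaced cuts and cumulative target weights; the rounding step
// leaves these integer values unchanged.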
2995 
2996 
3009 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3010  typename mj_part_t>
3011 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_initial_coordinate_parts(
3012  mj_scalar_t &max_coordinate,
3013  mj_scalar_t &min_coordinate,
3014  mj_part_t &/* concurrent_current_part_index */,
3015  mj_lno_t coordinate_begin_index,
3016  mj_lno_t coordinate_end_index,
3017  mj_lno_t *mj_current_coordinate_permutations,
3018  mj_scalar_t *mj_current_dim_coords,
3019  mj_part_t *mj_part_ids,
3020  mj_part_t &partition_count
3021 ){
3022  mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
3023 
3024  //if there is a single point, or if all points share the same coordinate along this dimension,
3025  //set the initial part to 0 for all.
3026  if(ZOLTAN2_ABS(coordinate_range) < this->sEpsilon ){
3027 #ifdef HAVE_ZOLTAN2_OMP
3028 #pragma omp parallel for
3029 #endif
3030  for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
3031  mj_part_ids[mj_current_coordinate_permutations[ii]] = 0;
3032  }
3033  }
3034  else{
3035 
3036  //otherwise estimate an initial part for each coordinate.
3037  //assuming uniform distribution of points.
3038  mj_scalar_t slice = coordinate_range / partition_count;
3039 
3040 #ifdef HAVE_ZOLTAN2_OMP
3041 #pragma omp parallel for
3042 #endif
3043  for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
3044 
3045  mj_lno_t iii = mj_current_coordinate_permutations[ii];
3046  mj_part_t pp = mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
3047  mj_part_ids[iii] = 2 * pp;
3048  }
3049  }
3050 }
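// Worked example for the estimate above (illustrative): with
// min_coordinate = 0, max_coordinate = 10 and partition_count = 4, the slice
// is 2.5, so a coordinate at 6.2 gets pp = floor(6.2 / 2.5) = 2 and an initial
// assigned id of 2 * pp = 4. Even ids denote parts and odd ids denote
// "on a cut line" in the 2P-1 part/cut numbering used by the weight
// computation that follows.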
3051 
3052 
3063 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3064  typename mj_part_t>
3065 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_1D_part(
3066  mj_scalar_t *mj_current_dim_coords,
3067  double used_imbalance_tolerance,
3068  mj_part_t current_work_part,
3069  mj_part_t current_concurrent_num_parts,
3070  mj_scalar_t *current_cut_coordinates,
3071  mj_part_t total_incomplete_cut_count,
3072  std::vector <mj_part_t> &num_partitioning_in_current_dim
3073 ){
3074 
3075 
3076  mj_part_t rectilinear_cut_count = 0;
3077  mj_scalar_t *temp_cut_coords = current_cut_coordinates;
3078 
3079  Teuchos::MultiJaggedCombinedReductionOp<mj_part_t, mj_scalar_t>
3080  *reductionOp = NULL;
3081  reductionOp = new Teuchos::MultiJaggedCombinedReductionOp
3082  <mj_part_t, mj_scalar_t>(
3083  &num_partitioning_in_current_dim ,
3084  current_work_part ,
3085  current_concurrent_num_parts);
3086 
3087  size_t total_reduction_size = 0;
3088 #ifdef HAVE_ZOLTAN2_OMP
3089 #pragma omp parallel shared(total_incomplete_cut_count, rectilinear_cut_count) num_threads(this->num_threads)
3090 #endif
3091  {
3092  int me = 0;
3093 #ifdef HAVE_ZOLTAN2_OMP
3094  me = omp_get_thread_num();
3095 #endif
3096  double *my_thread_part_weights = this->thread_part_weights[me];
3097  mj_scalar_t *my_thread_left_closest = this->thread_cut_left_closest_point[me];
3098  mj_scalar_t *my_thread_right_closest = this->thread_cut_right_closest_point[me];
3099 
3100 #ifdef HAVE_ZOLTAN2_OMP
3101 #pragma omp single
3102 #endif
3103  {
3104  //initialize the lower and upper bounds of the cuts.
3105  mj_part_t next = 0;
3106  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3107 
3108  mj_part_t num_part_in_dim = num_partitioning_in_current_dim[current_work_part + i];
3109  mj_part_t num_cut_in_dim = num_part_in_dim - 1;
3110  total_reduction_size += (4 * num_cut_in_dim + 1);
3111 
3112  for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii){
3113  this->is_cut_line_determined[next] = false;
3114  this->cut_lower_bound_coordinates[next] = global_min_max_coord_total_weight[i]; //min coordinate
3115  this->cut_upper_bound_coordinates[next] = global_min_max_coord_total_weight[i + current_concurrent_num_parts]; //max coordinate
3116 
3117  this->cut_upper_bound_weights[next] = global_min_max_coord_total_weight[i + 2 * current_concurrent_num_parts]; //total weight
3118  this->cut_lower_bound_weights[next] = 0;
3119 
3120  if(this->distribute_points_on_cut_lines){
3121  this->process_cut_line_weight_to_put_left[next] = 0;
3122  }
3123  ++next;
3124  }
3125  }
3126  }
3127 
3128  //no need to have a barrier here;
3129  //pragma omp single has an implicit barrier.
3130 
3131  int iteration = 0;
3132  while (total_incomplete_cut_count != 0){
3133  iteration += 1;
3134  mj_part_t concurrent_cut_shifts = 0;
3135  size_t total_part_shift = 0;
3136 
3137  for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
3138  mj_part_t num_parts = -1;
3139  num_parts = num_partitioning_in_current_dim[current_work_part + kk];
3140 
3141  mj_part_t num_cuts = num_parts - 1;
3142  size_t total_part_count = num_parts + size_t (num_cuts) ;
3143  if (this->my_incomplete_cut_count[kk] > 0){
3144 
3145  //although is_cut_line_determined is shared, the current_cut_status pointer is private and the same for all threads.
3146  bool *current_cut_status = this->is_cut_line_determined + concurrent_cut_shifts;
3147  double *my_current_part_weights = my_thread_part_weights + total_part_shift;
3148  mj_scalar_t *my_current_left_closest = my_thread_left_closest + concurrent_cut_shifts;
3149  mj_scalar_t *my_current_right_closest = my_thread_right_closest + concurrent_cut_shifts;
3150 
3151  mj_part_t conccurent_current_part = current_work_part + kk;
3152  mj_lno_t coordinate_begin_index = conccurent_current_part ==0 ? 0: this->part_xadj[conccurent_current_part -1];
3153  mj_lno_t coordinate_end_index = this->part_xadj[conccurent_current_part];
3154  mj_scalar_t *temp_current_cut_coords = temp_cut_coords + concurrent_cut_shifts;
3155 
3156  mj_scalar_t min_coord = global_min_max_coord_total_weight[kk];
3157  mj_scalar_t max_coord = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
3158 
3159  // compute part weights using existing cuts
3160  this->mj_1D_part_get_thread_part_weights(
3161  total_part_count,
3162  num_cuts,
3163  max_coord,//globalMinMaxTotal[kk + concurrentPartCount],//maxScalar,
3164  min_coord,//globalMinMaxTotal[kk]//minScalar,
3165  coordinate_begin_index,
3166  coordinate_end_index,
3167  mj_current_dim_coords,
3168  temp_current_cut_coords,
3169  current_cut_status,
3170  my_current_part_weights,
3171  my_current_left_closest,
3172  my_current_right_closest);
3173 
3174  }
3175 
3176  concurrent_cut_shifts += num_cuts;
3177  total_part_shift += total_part_count;
3178  }
3179 
3180  //sum up the results of threads
3181  this->mj_accumulate_thread_results(
3182  num_partitioning_in_current_dim,
3183  current_work_part,
3184  current_concurrent_num_parts);
3185 
3186  //now sum up the results of the MPI processes.
3187 #ifdef HAVE_ZOLTAN2_OMP
3188 #pragma omp single
3189 #endif
3190  {
3191  if(this->comm->getSize() > 1){
3192  reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
3193  total_reduction_size,
3194  this->total_part_weight_left_right_closests,
3195  this->global_total_part_weight_left_right_closests);
3196 
3197  }
3198  else {
3199  memcpy(
3200  this->global_total_part_weight_left_right_closests,
3201  this->total_part_weight_left_right_closests,
3202  total_reduction_size * sizeof(mj_scalar_t));
3203  }
3204  }
3205 
3206  //how much the cut index will be shifted for the next part in the concurrent part calculation.
3207  mj_part_t cut_shift = 0;
3208 
3209  //how much the concatenated array will be shifted for the next part in the concurrent part calculation.
3210  size_t tlr_shift = 0;
3211  for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
3212  mj_part_t num_parts = num_partitioning_in_current_dim[current_work_part + kk];
3213  mj_part_t num_cuts = num_parts - 1;
3214  size_t num_total_part = num_parts + size_t (num_cuts) ;
3215 
3216  //if the cuts of this part have already been completed,
3217  //there is nothing to do for this part.
3218  //just update the shift amounts and proceed.
3219  if (this->my_incomplete_cut_count[kk] == 0) {
3220  cut_shift += num_cuts;
3221  tlr_shift += (num_total_part + 2 * num_cuts);
3222  continue;
3223  }
3224 
3225  mj_scalar_t *current_local_part_weights = this->total_part_weight_left_right_closests + tlr_shift ;
3226  mj_scalar_t *current_global_tlr = this->global_total_part_weight_left_right_closests + tlr_shift;
3227  mj_scalar_t *current_global_left_closest_points = current_global_tlr + num_total_part; //left closest points
3228  mj_scalar_t *current_global_right_closest_points = current_global_tlr + num_total_part + num_cuts; //right closest points
3229  mj_scalar_t *current_global_part_weights = current_global_tlr;
3230  bool *current_cut_line_determined = this->is_cut_line_determined + cut_shift;
3231 
3232  mj_scalar_t *current_part_target_weights = this->target_part_weights + cut_shift + kk;
3233  mj_scalar_t *current_part_cut_line_weight_to_put_left = this->process_cut_line_weight_to_put_left + cut_shift;
3234 
3235  mj_scalar_t min_coordinate = global_min_max_coord_total_weight[kk];
3236  mj_scalar_t max_coordinate = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
3237  mj_scalar_t global_total_weight = global_min_max_coord_total_weight[kk + current_concurrent_num_parts * 2];
3238  mj_scalar_t *current_cut_lower_bound_weights = this->cut_lower_bound_weights + cut_shift;
3239  mj_scalar_t *current_cut_upper_weights = this->cut_upper_bound_weights + cut_shift;
3240  mj_scalar_t *current_cut_upper_bounds = this->cut_upper_bound_coordinates + cut_shift;
3241  mj_scalar_t *current_cut_lower_bounds = this->cut_lower_bound_coordinates + cut_shift;
3242 
3243  mj_part_t initial_incomplete_cut_count = this->my_incomplete_cut_count[kk];
3244 
3245  // Now compute the new cut coordinates.
3246  this->mj_get_new_cut_coordinates(
3247  num_total_part,
3248  num_cuts,
3249  max_coordinate,
3250  min_coordinate,
3251  global_total_weight,
3252  used_imbalance_tolerance,
3253  current_global_part_weights,
3254  current_local_part_weights,
3255  current_part_target_weights,
3256  current_cut_line_determined,
3257  temp_cut_coords + cut_shift,
3258  current_cut_upper_bounds,
3259  current_cut_lower_bounds,
3260  current_global_left_closest_points,
3261  current_global_right_closest_points,
3262  current_cut_lower_bound_weights,
3263  current_cut_upper_weights,
3264  this->cut_coordinates_work_array +cut_shift, //new cut coordinates
3265  current_part_cut_line_weight_to_put_left,
3266  &rectilinear_cut_count,
3267  this->my_incomplete_cut_count[kk]);
3268 
3269  cut_shift += num_cuts;
3270  tlr_shift += (num_total_part + 2 * num_cuts);
3271  mj_part_t iteration_complete_cut_count = initial_incomplete_cut_count - this->my_incomplete_cut_count[kk];
3272 #ifdef HAVE_ZOLTAN2_OMP
3273 #pragma omp single
3274 #endif
3275  {
3276  total_incomplete_cut_count -= iteration_complete_cut_count;
3277  }
3278 
3279  }
3280  { //This unnecessary bracket works around a compiler bug in NVCC when compiling with OpenMP enabled
3281 #ifdef HAVE_ZOLTAN2_OMP
3282 #pragma omp barrier
3283 #pragma omp single
3284 #endif
3285  {
3286  //swap the cut coordinates for next iteration.
3287  mj_scalar_t *t = temp_cut_coords;
3288  temp_cut_coords = this->cut_coordinates_work_array;
3289  this->cut_coordinates_work_array = t;
3290  }
3291  }
3292  }
3293 
3294  //if (myRank == 0)
3295  //std::cout << "iteration:" << iteration << " partition:" << num_partitioning_in_current_dim[current_work_part] << std::endl;
3296  // Needed only if keep_cuts; otherwise we can simply swap the array pointers
3297  // cutCoordinates and cutCoordinatesWork.
3298  // (at the first iteration, cutCoordinates == cutCoordinates_tmp).
3299  // The computed cuts must end up in cutCoordinates.
3300  if (current_cut_coordinates != temp_cut_coords){
3301 #ifdef HAVE_ZOLTAN2_OMP
3302 #pragma omp single
3303 #endif
3304  {
3305  mj_part_t next = 0;
3306  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3307  mj_part_t num_parts = -1;
3308  num_parts = num_partitioning_in_current_dim[current_work_part + i];
3309  mj_part_t num_cuts = num_parts - 1;
3310 
3311  for(mj_part_t ii = 0; ii < num_cuts; ++ii){
3312  current_cut_coordinates[next + ii] = temp_cut_coords[next + ii];
3313  }
3314  next += num_cuts;
3315  }
3316  }
3317 
3318 #ifdef HAVE_ZOLTAN2_OMP
3319 #pragma omp single
3320 #endif
3321  {
3322  this->cut_coordinates_work_array = temp_cut_coords;
3323  }
3324  }
3325  }
3326  delete reductionOp;
3327 }
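// A minimal single-cut sketch of the refinement loop above (illustrative
// pseudocode only; the real update in mj_get_new_cut_coordinates also handles
// rectilinear cuts, cut-line point distribution and concurrent parts):
//
//   // invariant: lower/upper bracket the cut, lower_w/upper_w bracket its weight
//   while (!is_cut_line_determined) {
//     weight_left = weight of coordinates at or left of cut;   // from the reduceAll
//     if (fabs(weight_left - target) <= tolerance) {
//       is_cut_line_determined = true;
//     } else if (weight_left < target) {
//       lower = cut; lower_w = weight_left;
//       cut   = lower + (upper - lower) * (target - lower_w) / (upper_w - lower_w);
//     } else {
//       upper = cut; upper_w = weight_left;
//       cut   = lower + (upper - lower) * (target - lower_w) / (upper_w - lower_w);
//     }
//   }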
3328 
3329 
3349 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3350  typename mj_part_t>
3351 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_1D_part_get_thread_part_weights(
3352  size_t total_part_count,
3353  mj_part_t num_cuts,
3354  mj_scalar_t max_coord,
3355  mj_scalar_t min_coord,
3356  mj_lno_t coordinate_begin_index,
3357  mj_lno_t coordinate_end_index,
3358  mj_scalar_t *mj_current_dim_coords,
3359  mj_scalar_t *temp_current_cut_coords,
3360  bool * /* current_cut_status */,
3361  double *my_current_part_weights,
3362  mj_scalar_t *my_current_left_closest,
3363  mj_scalar_t *my_current_right_closest){
3364 
3365  // initializations for part weights, left/right closest
3366  for (size_t i = 0; i < total_part_count; ++i){
3367  my_current_part_weights[i] = 0;
3368  }
3369 
3370  //initialize the left and right closest coordinates
3371  //to sentinel values just outside the coordinate range.
3372  for(mj_part_t i = 0; i < num_cuts; ++i){
3373  my_current_left_closest[i] = min_coord - 1;
3374  my_current_right_closest[i] = max_coord + 1;
3375  }
3376  //mj_lno_t comparison_count = 0;
3377  mj_scalar_t minus_EPSILON = -this->sEpsilon;
3378 #ifdef HAVE_ZOLTAN2_OMP
3379  //no need for a barrier as all threads use their local memory.
3380  //don't change the static scheduling here, as it is assumed when the new
3381  //partitions are created later.
3382 #pragma omp for
3383 #endif
3384  for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
3385  int i = this->coordinate_permutations[ii];
3386 
3387  //the accesses to assigned_part_ids are thread safe
3388  //since each coordinate is assigned to only a single thread.
3389  mj_part_t j = this->assigned_part_ids[i] / 2;
3390 
3391  if(j >= num_cuts){
3392  j = num_cuts - 1;
3393  }
3394 
3395  mj_part_t lower_cut_index = 0;
3396  mj_part_t upper_cut_index = num_cuts - 1;
3397 
3398  mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
3399  bool is_inserted = false;
3400  bool is_on_left_of_cut = false;
3401  bool is_on_right_of_cut = false;
3402  mj_part_t last_compared_part = -1;
3403 
3404  mj_scalar_t coord = mj_current_dim_coords[i];
3405 
3406  while(upper_cut_index >= lower_cut_index)
3407  {
3408  //comparison_count++;
3409  last_compared_part = -1;
3410  is_on_left_of_cut = false;
3411  is_on_right_of_cut = false;
3412  mj_scalar_t cut = temp_current_cut_coords[j];
3413  mj_scalar_t distance_to_cut = coord - cut;
3414  mj_scalar_t abs_distance_to_cut = ZOLTAN2_ABS(distance_to_cut);
3415 
3416  //if it is on the line.
3417  if(abs_distance_to_cut < this->sEpsilon){
3418 
3419  my_current_part_weights[j * 2 + 1] += w;
3420  this->assigned_part_ids[i] = j * 2 + 1;
3421 
3422  //assign left and right closest point to cut as the point is on the cut.
3423  my_current_left_closest[j] = coord;
3424  my_current_right_closest[j] = coord;
3425  //now we need to check if there are other cuts at the same cut coordinate.
3426  //if there are, then we add the weight of the point to all cuts at the same coordinate.
3427  mj_part_t kk = j + 1;
3428  while(kk < num_cuts){
3429  // Needed when cuts share the same position
3430  distance_to_cut =ZOLTAN2_ABS(temp_current_cut_coords[kk] - cut);
3431  if(distance_to_cut < this->sEpsilon){
3432  my_current_part_weights[2 * kk + 1] += w;
3433  my_current_left_closest[kk] = coord;
3434  my_current_right_closest[kk] = coord;
3435  kk++;
3436  }
3437  else{
3438  //cut is far away.
3439  //just check the left closest point for the next cut.
3440  if(coord - my_current_left_closest[kk] > this->sEpsilon){
3441  my_current_left_closest[kk] = coord;
3442  }
3443  break;
3444  }
3445  }
3446 
3447 
3448  kk = j - 1;
3449  //continue checking for the cuts on the left if they share the same coordinate.
3450  while(kk >= 0){
3451  distance_to_cut =ZOLTAN2_ABS(temp_current_cut_coords[kk] - cut);
3452  if(distance_to_cut < this->sEpsilon){
3453  my_current_part_weights[2 * kk + 1] += w;
3454  //try to write the partId as the leftmost cut.
3455  this->assigned_part_ids[i] = kk * 2 + 1;
3456  my_current_left_closest[kk] = coord;
3457  my_current_right_closest[kk] = coord;
3458  kk--;
3459  }
3460  else{
3461  //if cut is far away on the left of the point.
3462  //then just compare for right closest point.
3463  if(my_current_right_closest[kk] - coord > this->sEpsilon){
3464  my_current_right_closest[kk] = coord;
3465  }
3466  break;
3467  }
3468  }
3469 
3470  is_inserted = true;
3471  break;
3472  }
3473  else {
3474  //if point is on the left of the cut.
3475  if (distance_to_cut < 0) {
3476  bool _break = false;
3477  if(j > 0){
3478  //check the distance to the cut on the left of the current cut.
3479  //if the point is on the right of that cut, then we have found the point's part.
3480  mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j - 1];
3481  if(distance_to_next_cut > this->sEpsilon){
3482  _break = true;
3483  }
3484  }
3485  //the point is on the left of cut j, so narrow the search:
3486  //set the upper bound to the cut on the left.
3487  upper_cut_index = j - 1;
3488  //remember the last compared cut, and mark the point as being on its left.
3489  is_on_left_of_cut = true;
3490  last_compared_part = j;
3491  if(_break) break;
3492  }
3493  else {
3494  //if point is on the right of the cut.
3495  bool _break = false;
3496  if(j < num_cuts - 1){
3497  //check the distance to the cut on the right of the current cut.
3498  //if the point is on the left of that cut, then we have found the point's part.
3499  mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j + 1];
3500  if(distance_to_next_cut < minus_EPSILON){
3501  _break = true;
3502  }
3503  }
3504 
3505  //the point is on the right of cut j, so narrow the search:
3506  //set the lower bound to the cut on the right.
3507  lower_cut_index = j + 1;
3508  //remember the last compared cut, and mark the point as being on its right.
3509  is_on_right_of_cut = true;
3510  last_compared_part = j;
3511  if(_break) break;
3512  }
3513  }
3514 
3515  j = (upper_cut_index + lower_cut_index) / 2;
3516  }
3517  if(!is_inserted){
3518  if(is_on_right_of_cut){
3519 
3520  //add it to the right of the last compared part.
3521  my_current_part_weights[2 * last_compared_part + 2] += w;
3522  this->assigned_part_ids[i] = 2 * last_compared_part + 2;
3523 
3524  //update the right closest point of last compared cut.
3525  if(my_current_right_closest[last_compared_part] - coord > this->sEpsilon){
3526  my_current_right_closest[last_compared_part] = coord;
3527  }
3528  //update the left closest point of the cut on the right of the last compared cut.
3529  if(last_compared_part+1 < num_cuts){
3530 
3531  if(coord - my_current_left_closest[last_compared_part + 1] > this->sEpsilon){
3532  my_current_left_closest[last_compared_part + 1] = coord;
3533  }
3534  }
3535 
3536  }
3537  else if(is_on_left_of_cut){
3538 
3539  //add it to the left of the last compared part.
3540  my_current_part_weights[2 * last_compared_part] += w;
3541  this->assigned_part_ids[i] = 2 * last_compared_part;
3542 
3543 
3544  //update the left closest point of last compared cut.
3545  if(coord - my_current_left_closest[last_compared_part] > this->sEpsilon){
3546  my_current_left_closest[last_compared_part] = coord;
3547  }
3548 
3549  //update the right closest point of the cut on the left of the last compared cut.
3550  if(last_compared_part-1 >= 0){
3551  if(my_current_right_closest[last_compared_part -1] - coord > this->sEpsilon){
3552  my_current_right_closest[last_compared_part -1] = coord;
3553  }
3554  }
3555  }
3556  }
3557  }
3558 
3559  // prefix sum computation.
3560  //we need prefix sum for each part to determine cut positions.
3561  for (size_t i = 1; i < total_part_count; ++i){
3562  // check for cuts sharing the same position; all cuts sharing a position
3563  // have the same weight == total weight for all cuts sharing the position.
3564  // don't want to accumulate that total weight more than once.
3565  if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
3566  ZOLTAN2_ABS(temp_current_cut_coords[i / 2] - temp_current_cut_coords[i /2 - 1])
3567  < this->sEpsilon){
3568  //an even index i corresponds to the part bounded by cuts i/2 - 1 and i/2.
3569  //if those two bounding cuts share the same coordinate, the on-cut weight
3570  //was already counted for both cut slots, so don't add it up twice.
3571  my_current_part_weights[i] = my_current_part_weights[i-2];
3572  continue;
3573  }
3574  //otherwise do the prefix sum.
3575  my_current_part_weights[i] += my_current_part_weights[i-1];
3576  }
3577 }
3578 
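// Illustration: mj_1D_part_get_thread_part_weights above locates each coordinate's
// part with a binary search over the sorted cut coordinates, treats points within
// sEpsilon of a cut as "on the line" (odd slots of the weight array), and finally
// turns the per-slot weights into a prefix sum. Below is a minimal standalone sketch
// of that slot layout and search, assuming unit weights; find_part_index, EPS and the
// data are made up for illustration and are not Zoltan2 code. (The library version
// additionally tracks left/right closest points and cuts that share a position.)
#if 0   // illustrative only; never compiled with this header
#include <cmath>
#include <cstdio>
#include <vector>

// Return 2*k   if coord falls strictly inside part k,
// or     2*k+1 if coord lies on cut k (within EPS), mirroring the
// part/cut slot layout of my_current_part_weights above.
int find_part_index(double coord, const std::vector<double> &cuts, double EPS) {
  int lo = 0, hi = static_cast<int>(cuts.size()) - 1;
  while (lo <= hi) {
    int mid = (lo + hi) / 2;
    double d = coord - cuts[mid];
    if (std::fabs(d) < EPS) return 2 * mid + 1;  // on the cut line
    if (d < 0) hi = mid - 1;                     // continue among cuts to the left
    else       lo = mid + 1;                     // continue among cuts to the right
  }
  return 2 * lo;  // strictly inside part 'lo'
}

int main() {
  std::vector<double> cuts = {1.0, 2.0, 3.0};         // 4 parts, 3 cuts
  double coords[] = {0.5, 1.0, 2.5, 3.7};
  std::vector<double> w(2 * cuts.size() + 1, 0.0);    // part/cut weight slots
  for (double c : coords) w[find_part_index(c, cuts, 1e-9)] += 1.0;
  for (size_t i = 1; i < w.size(); ++i) w[i] += w[i - 1];  // prefix sum, as above
  for (double v : w) std::printf("%g ", v);           // 1 2 2 2 3 3 4
  std::printf("\n");
  return 0;
}
#endif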
3579 
3587 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3588  typename mj_part_t>
3589 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_accumulate_thread_results(
3590  const std::vector <mj_part_t> &num_partitioning_in_current_dim,
3591  mj_part_t current_work_part,
3592  mj_part_t current_concurrent_num_parts){
3593 
3594 #ifdef HAVE_ZOLTAN2_OMP
3595  //a barrier is needed here, as all threads must finish mj_1D_part_get_thread_part_weights.
3596  //using a parallel region here reduces performance because of cache invalidations.
3597 #pragma omp barrier
3598 #pragma omp single
3599 #endif
3600  {
3601  size_t tlr_array_shift = 0;
3602  mj_part_t cut_shift = 0;
3603 
3604  //iterate for all concurrent parts to find the left and right closest points in the process.
3605  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3606 
3607  mj_part_t num_parts_in_part = num_partitioning_in_current_dim[current_work_part + i];
3608  mj_part_t num_cuts_in_part = num_parts_in_part - 1;
3609  size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;
3610 
3611  //iterate for cuts in a single part.
3612  for(mj_part_t ii = 0; ii < num_cuts_in_part ; ++ii){
3613  mj_part_t next = tlr_array_shift + ii;
3614  mj_part_t cut_index = cut_shift + ii;
3615  if(this->is_cut_line_determined[cut_index]) continue;
3616  mj_scalar_t left_closest_in_process = this->thread_cut_left_closest_point[0][cut_index],
3617  right_closest_in_process = this->thread_cut_right_closest_point[0][cut_index];
3618 
3619  //find the closest points from left and right for the cut in the process.
3620  for (int j = 1; j < this->num_threads; ++j){
3621  if (this->thread_cut_right_closest_point[j][cut_index] < right_closest_in_process ){
3622  right_closest_in_process = this->thread_cut_right_closest_point[j][cut_index];
3623  }
3624  if (this->thread_cut_left_closest_point[j][cut_index] > left_closest_in_process ){
3625  left_closest_in_process = this->thread_cut_left_closest_point[j][cut_index];
3626  }
3627  }
3628  //store the left and right closest points.
3629  this->total_part_weight_left_right_closests[num_total_part_in_part +
3630  next] = left_closest_in_process;
3631  this->total_part_weight_left_right_closests[num_total_part_in_part +
3632  num_cuts_in_part + next] = right_closest_in_process;
3633  }
3634  //set the shift position in the arrays
3635  tlr_array_shift += (num_total_part_in_part + 2 * num_cuts_in_part);
3636  cut_shift += num_cuts_in_part;
3637  }
3638 
3639  tlr_array_shift = 0;
3640  cut_shift = 0;
3641  size_t total_part_array_shift = 0;
3642 
3643  //iterate for all concurrent parts to find the total weight in the process.
3644  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3645 
3646  mj_part_t num_parts_in_part = num_partitioning_in_current_dim[current_work_part + i];
3647  mj_part_t num_cuts_in_part = num_parts_in_part - 1;
3648  size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;
3649 
3650  for(size_t j = 0; j < num_total_part_in_part; ++j){
3651 
3652  mj_part_t cut_ind = j / 2 + cut_shift;
3653 
3654  //need to check j != num_total_part_in_part - 1,
3655  // which is the same as j/2 != num_cuts_in_part.
3656  //we cannot check it using cut_ind, because of the concatenation of the concurrent parts.
3657  if(j != num_total_part_in_part - 1 && this->is_cut_line_determined[cut_ind]) continue;
3658  double pwj = 0;
3659  for (int k = 0; k < this->num_threads; ++k){
3660  pwj += this->thread_part_weights[k][total_part_array_shift + j];
3661  }
3662  //size_t jshift = j % total_part_count + i * (total_part_count + 2 * noCuts);
3663  this->total_part_weight_left_right_closests[tlr_array_shift + j] = pwj;
3664  }
3665  cut_shift += num_cuts_in_part;
3666  tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
3667  total_part_array_shift += num_total_part_in_part;
3668  }
3669  }
3670  //the other threads need to wait here,
3671  //but we don't need an explicit omp barrier,
3672  //as omp single already has an implicit barrier.
3673 }
3674 
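// Illustration: mj_accumulate_thread_results above reduces the per-thread arrays:
// part weights are summed over threads, the right-closest point of a cut is the
// minimum over threads, and the left-closest point is the maximum. A tiny serial
// sketch of that reduction with made-up per-thread data (array names are
// hypothetical, not the library's):
#if 0   // illustrative only; never compiled with this header
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // thread_left[t][c] / thread_right[t][c]: thread t's closest point to cut c
  // from the left / from the right.
  std::vector<std::vector<double>> thread_left  = {{0.9, 1.8}, {0.7, 1.95}};
  std::vector<std::vector<double>> thread_right = {{1.2, 2.3}, {1.05, 2.6}};
  size_t num_cuts = thread_left[0].size();
  for (size_t c = 0; c < num_cuts; ++c) {
    double left = thread_left[0][c], right = thread_right[0][c];
    for (size_t t = 1; t < thread_left.size(); ++t) {
      left  = std::max(left,  thread_left[t][c]);   // closest from the left
      right = std::min(right, thread_right[t][c]);  // closest from the right
    }
    std::printf("cut %zu: left closest %g, right closest %g\n", c, left, right);
  }
  return 0;
}
#endif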
3675 
3685 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3686  typename mj_part_t>
3687 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_calculate_new_cut_position (
3688  mj_scalar_t cut_upper_bound,
3689  mj_scalar_t cut_lower_bound,
3690  mj_scalar_t cut_upper_weight,
3691  mj_scalar_t cut_lower_weight,
3692  mj_scalar_t expected_weight,
3693  mj_scalar_t &new_cut_position){
3694 
3695  if(ZOLTAN2_ABS(cut_upper_bound - cut_lower_bound) < this->sEpsilon){
3696  new_cut_position = cut_upper_bound; return; //or the lower bound; they are nearly equal.
3697  }
3698 
3699 
3700  if(ZOLTAN2_ABS(cut_upper_weight - cut_lower_weight) < this->sEpsilon){
3701  new_cut_position = cut_lower_bound; return; //avoid dividing by a near-zero weight range below.
3702  }
3703 
3704  mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
3705  mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
3706  mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
3707 
3708  mj_scalar_t required_shift = (my_weight_diff / weight_range);
3709  int scale_constant = 20;
3710  int shiftint= int (required_shift * scale_constant);
3711  if (shiftint == 0) shiftint = 1;
3712  required_shift = mj_scalar_t (shiftint) / scale_constant;
3713  new_cut_position = coordinate_range * required_shift + cut_lower_bound;
3714 }
3715 
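// Illustration: mj_calculate_new_cut_position above interpolates the new cut
// linearly between the bounds according to the weight still needed, and snaps the
// interpolation ratio to multiples of 1/scale_constant (1/20) so each iteration
// moves the cut by a non-vanishing step. A standalone sketch with a worked example;
// interpolate_cut is a made-up helper name, not the library API.
#if 0   // illustrative only; never compiled with this header
#include <cstdio>

double interpolate_cut(double lo, double hi, double w_lo, double w_hi,
                       double w_target, int scale = 20) {
  double ratio = (w_target - w_lo) / (w_hi - w_lo);  // fraction of the weight range needed
  int steps = static_cast<int>(ratio * scale);       // quantize to multiples of 1/scale
  if (steps == 0) steps = 1;                         // always move at least 1/scale
  return lo + (hi - lo) * (double(steps) / scale);
}

int main() {
  // Bounds at coordinates 0 and 10 carrying cumulative weights 100 and 300;
  // target 180: ratio = 80/200 = 0.4 -> 8/20 -> new cut at 0 + 10*0.4 = 4.
  std::printf("%g\n", interpolate_cut(0.0, 10.0, 100.0, 300.0, 180.0));
  return 0;
}
#endif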
3716 
3727 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3728  typename mj_part_t>
3729 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_create_new_partitions(
3730  mj_part_t num_parts,
3731  mj_scalar_t * /* mj_current_dim_coords */,
3732  mj_scalar_t *current_concurrent_cut_coordinate,
3733  mj_lno_t coordinate_begin,
3734  mj_lno_t coordinate_end,
3735  mj_scalar_t *used_local_cut_line_weight_to_left,
3736  double **used_thread_part_weight_work,
3737  mj_lno_t *out_part_xadj){
3738 
3739  mj_part_t num_cuts = num_parts - 1;
3740 
3741 #ifdef HAVE_ZOLTAN2_OMP
3742 #pragma omp parallel
3743 #endif
3744  {
3745  int me = 0;
3746 #ifdef HAVE_ZOLTAN2_OMP
3747  me = omp_get_thread_num();
3748 #endif
3749 
3750  mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
3751  mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;
3752 
3753  //now if the rectilinear partitioning is allowed we decide how
3754  //much weight each thread should put to left and right.
3755  if (this->distribute_points_on_cut_lines){
3756  my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
3757  // this loop assumes the static scheduling used in the mj_1D_part calculation.
3758 #ifdef HAVE_ZOLTAN2_OMP
3759 #pragma omp for
3760 #endif
3761  for (mj_part_t i = 0; i < num_cuts; ++i){
3762  //the weight to be put on the left of the cut.
3763  mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
3764  for(int ii = 0; ii < this->num_threads; ++ii){
3765  if(left_weight > this->sEpsilon){
3766  //the weight of thread ii on cut.
3767  mj_scalar_t thread_ii_weight_on_cut = used_thread_part_weight_work[ii][i * 2 + 1] - used_thread_part_weight_work[ii][i * 2 ];
3768  if(thread_ii_weight_on_cut < left_weight){
3769  //if the left weight is bigger than the thread's weight on the cut.
3770  this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
3771  }
3772  else {
3773  //if thread's weight is bigger than space, then put only a portion.
3774  this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
3775  }
3776  left_weight -= thread_ii_weight_on_cut;
3777  }
3778  else {
3779  this->thread_cut_line_weight_to_put_left[ii][i] = 0;
3780  }
3781  }
3782  }
3783 
3784  if(num_cuts > 0){
3785  //this is a special case. If cutlines share the same coordinate, their weights are equal.
3786  //we need to adjust the ratio for that.
3787  for (mj_part_t i = num_cuts - 1; i > 0 ; --i){
3788  if(ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
3789  my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
3790  }
3791  my_local_thread_cut_weights_to_put_left[i] = static_cast<long long>((my_local_thread_cut_weights_to_put_left[i] + LEAST_SIGNIFICANCE) * SIGNIFICANCE_MUL)
3792  / mj_scalar_t(SIGNIFICANCE_MUL);
3793  }
3794  }
3795  }
3796 
3797  for(mj_part_t ii = 0; ii < num_parts; ++ii){
3798  thread_num_points_in_parts[ii] = 0;
3799  }
3800 
3801 
3802 #ifdef HAVE_ZOLTAN2_OMP
3803  //don't change the static scheduling; the same static partitioning is assumed later as well.
3804 #pragma omp for
3805 #endif
3806  for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
3807 
3808  mj_lno_t coordinate_index = this->coordinate_permutations[ii];
3809  mj_scalar_t coordinate_weight = this->mj_uniform_weights[0]? 1:this->mj_weights[0][coordinate_index];
3810  mj_part_t coordinate_assigned_place = this->assigned_part_ids[coordinate_index];
3811  mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
3812  if(coordinate_assigned_place % 2 == 1){
3813  //if it is on the cut.
3814  if(this->distribute_points_on_cut_lines
3815  && my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] > this->sEpsilon){
3816  //if the rectilinear partitioning is allowed,
3817  //and the thread still has space to put on the left of the cut,
3818  //then the thread puts the coordinate to the left.
3819  my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
3820  //if putting the vertex to left increased the weight more than expected.
3821  //and if the next cut is on the same coordinate,
3822  //then we need to adjust how much weight next cut puts to its left as well,
3823  //in order to take care of the imbalance.
3824  if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0
3825  && coordinate_assigned_part < num_cuts - 1
3826  && ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
3827  current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
3828  my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
3829  }
3830  ++thread_num_points_in_parts[coordinate_assigned_part];
3831  this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3832  }
3833  else{
3834  //if there is no more space on the left, put the coordinate to the right of the cut.
3835  ++coordinate_assigned_part;
3836  //this while loop is necessary when a line is partitioned into more than 2 parts.
3837  while(this->distribute_points_on_cut_lines &&
3838  coordinate_assigned_part < num_cuts){
3839  //traverse all the cut lines sharing the same position.
3840  if(ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part] -
3841  current_concurrent_cut_coordinate[coordinate_assigned_part - 1])
3842  < this->sEpsilon){
3843  //if line has enough space on left, put it there.
3844  if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >
3845  this->sEpsilon &&
3846  my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >=
3847  ZOLTAN2_ABS(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] - coordinate_weight)){
3848  my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
3849  //Again if it put too much on left of the cut,
3850  //update how much the next cut sharing the same coordinate will put to its left.
3851  if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0 &&
3852  coordinate_assigned_part < num_cuts - 1 &&
3853  ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
3854  current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
3855  my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
3856  }
3857  break;
3858  }
3859  }
3860  else {
3861  break;
3862  }
3863  ++coordinate_assigned_part;
3864  }
3865  ++thread_num_points_in_parts[coordinate_assigned_part];
3866  this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3867  }
3868  }
3869  else {
3870  //if it is already assigned to a part, then just put it to the corresponding part.
3871  ++thread_num_points_in_parts[coordinate_assigned_part];
3872  this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3873  }
3874  }
3875 
3876 
3877 
3878  //now we calculate where each thread will write in the new_coordinate_permutations array.
3879  //first we find out_part_xadj by marking the begin and end points of each part found.
3880  //the loop below finds the number of points in each part, and writes it to out_part_xadj.
3881 #ifdef HAVE_ZOLTAN2_OMP
3882 #pragma omp for
3883 #endif
3884  for(mj_part_t j = 0; j < num_parts; ++j){
3885  mj_lno_t num_points_in_part_j_upto_thread_i = 0;
3886  for (int i = 0; i < this->num_threads; ++i){
3887  mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
3888  //prefix sum to thread point counts, so that each will have private space to write.
3889  this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
3890  num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;
3891 
3892  }
3893  out_part_xadj[j] = num_points_in_part_j_upto_thread_i;// + prev2; //+ coordinateBegin;
3894  }
3895 
3896  //now we need a prefix sum over out_part_xadj, so that it points to the begin and end of each part.
3897 #ifdef HAVE_ZOLTAN2_OMP
3898 #pragma omp single
3899 #endif
3900  {
3901  //perform prefix sum for num_points in parts.
3902  for(mj_part_t j = 1; j < num_parts; ++j){
3903  out_part_xadj[j] += out_part_xadj[j - 1];
3904  }
3905  }
3906 
3907  //shift the per-thread point counts to obtain the
3908  //beginning index of each thread's private space.
3909  for(mj_part_t j = 1; j < num_parts; ++j){
3910  thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
3911  }
3912 
3913 
3914  //now each thread takes its coordinates and writes the coordinate indices to the permutation array
3915  //using the part indices we calculated.
3916 #ifdef HAVE_ZOLTAN2_OMP
3917 #pragma omp for
3918 #endif
3919  for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
3920  mj_lno_t i = this->coordinate_permutations[ii];
3921  mj_part_t p = this->assigned_part_ids[i];
3922  this->new_coordinate_permutations[coordinate_begin +
3923  thread_num_points_in_parts[p]++] = i;
3924  }
3925  }
3926 }
3927 
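// Illustration: mj_create_new_partitions above builds the new permutation with a
// counting-sort style two-pass scheme: each thread counts its points per part, a
// prefix sum over (part, thread) pairs gives every thread a private write offset,
// and a second pass scatters the coordinate indices. A serial sketch of the same
// scheme; the "threads" are just indices here, and all names and data are made up.
#if 0   // illustrative only; never compiled with this header
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> part_of  = {1, 0, 2, 1, 0, 2, 1};  // part of each coordinate
  std::vector<int> owner_of = {0, 0, 0, 1, 1, 1, 1};  // "thread" owning each coordinate
  int num_parts = 3, num_threads = 2, n = (int)part_of.size();

  // counts[t][p]: points of thread t that go to part p.
  std::vector<std::vector<int>> counts(num_threads, std::vector<int>(num_parts, 0));
  for (int i = 0; i < n; ++i) counts[owner_of[i]][part_of[i]]++;

  // Exclusive prefix sum over parts (outer) and threads (inner): each
  // (thread, part) pair gets a private, contiguous slice of the output.
  std::vector<int> part_xadj(num_parts, 0);
  int offset = 0;
  for (int p = 0; p < num_parts; ++p)
    for (int t = 0; t < num_threads; ++t) {
      int c = counts[t][p];
      counts[t][p] = offset;   // becomes the thread's write cursor for part p
      offset += c;
      part_xadj[p] = offset;   // end index of part p
    }

  // Scatter pass: each coordinate lands in its thread's slice of its part.
  std::vector<int> new_perm(n);
  for (int i = 0; i < n; ++i) new_perm[counts[owner_of[i]][part_of[i]]++] = i;

  for (int i = 0; i < n; ++i) std::printf("%d ", new_perm[i]);             // 1 4 0 3 6 2 5
  std::printf("\n");
  for (int p = 0; p < num_parts; ++p) std::printf("%d ", part_xadj[p]);    // 2 5 7
  std::printf("\n");
  return 0;
}
#endif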
3928 
3929 
3958 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3959  typename mj_part_t>
3960 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_new_cut_coordinates(
3961  const size_t &/* num_total_part */,
3962  const mj_part_t &num_cuts,
3963  const mj_scalar_t &max_coordinate,
3964  const mj_scalar_t &min_coordinate,
3965  const mj_scalar_t &global_total_weight,
3966  const double &used_imbalance_tolerance,
3967  mj_scalar_t * current_global_part_weights,
3968  const mj_scalar_t * current_local_part_weights,
3969  const mj_scalar_t *current_part_target_weights,
3970  bool *current_cut_line_determined,
3971  mj_scalar_t *current_cut_coordinates,
3972  mj_scalar_t *current_cut_upper_bounds,
3973  mj_scalar_t *current_cut_lower_bounds,
3974  mj_scalar_t *current_global_left_closest_points,
3975  mj_scalar_t *current_global_right_closest_points,
3976  mj_scalar_t * current_cut_lower_bound_weights,
3977  mj_scalar_t * current_cut_upper_weights,
3978  mj_scalar_t *new_current_cut_coordinates,
3979  mj_scalar_t *current_part_cut_line_weight_to_put_left,
3980  mj_part_t *rectilinear_cut_count,
3981  mj_part_t &my_num_incomplete_cut){
3982 
3983  //seen weight in the part
3984  mj_scalar_t seen_weight_in_part = 0;
3985  //expected weight for part.
3986  mj_scalar_t expected_weight_in_part = 0;
3987  //imbalance for the left and right side of the cut.
3988  double imbalance_on_left = 0, imbalance_on_right = 0;
3989 
3990 
3991 #ifdef HAVE_ZOLTAN2_OMP
3992 #pragma omp for
3993 #endif
3994  for (mj_part_t i = 0; i < num_cuts; i++){
3995  //if the left and right closest points are not set yet,
3996  //set them to the cut itself.
3997  if(min_coordinate - current_global_left_closest_points[i] > this->sEpsilon)
3998  current_global_left_closest_points[i] = current_cut_coordinates[i];
3999  if(current_global_right_closest_points[i] - max_coordinate > this->sEpsilon)
4000  current_global_right_closest_points[i] = current_cut_coordinates[i];
4001 
4002  }
4003 #ifdef HAVE_ZOLTAN2_OMP
4004 #pragma omp for
4005 #endif
4006  for (mj_part_t i = 0; i < num_cuts; i++){
4007 
4008  if(this->distribute_points_on_cut_lines){
4009  //init the weight on the cut.
4010  this->global_rectilinear_cut_weight[i] = 0;
4011  this->process_rectilinear_cut_weight[i] = 0;
4012  }
4013  //if already determined at previous iterations,
4014  //then just write the coordinate to new array, and proceed.
4015  if(current_cut_line_determined[i]) {
4016  new_current_cut_coordinates[i] = current_cut_coordinates[i];
4017  continue;
4018  }
4019 
4020  //current weight of the part at the left of the cut line.
4021  seen_weight_in_part = current_global_part_weights[i * 2];
4022 
4023  /*
4024  std::cout << "seen_weight_in_part:" << i << " is "<< seen_weight_in_part <<std::endl;
4025  std::cout << "\tcut:" << current_cut_coordinates[i]
4026  << " current_cut_lower_bounds:" << current_cut_lower_bounds[i]
4027  << " current_cut_upper_bounds:" << current_cut_upper_bounds[i] << std::endl;
4028  */
4029  //target weight for the region on the left of cut i.
4030  expected_weight_in_part = current_part_target_weights[i];
4031  //leftImbalance = imbalanceOf(seenW, globalTotalWeight, expected);
4032  imbalance_on_left = imbalanceOf2(seen_weight_in_part, expected_weight_in_part);
4033  //rightImbalance = imbalanceOf(globalTotalWeight - seenW, globalTotalWeight, 1 - expected);
4034  imbalance_on_right = imbalanceOf2(global_total_weight - seen_weight_in_part, global_total_weight - expected_weight_in_part);
4035 
4036  bool is_left_imbalance_valid = ZOLTAN2_ABS(imbalance_on_left) - used_imbalance_tolerance < this->sEpsilon ;
4037  bool is_right_imbalance_valid = ZOLTAN2_ABS(imbalance_on_right) - used_imbalance_tolerance < this->sEpsilon;
4038 
4039  //if the cut line reaches the desired imbalance.
4040  if(is_left_imbalance_valid && is_right_imbalance_valid){
4041  current_cut_line_determined[i] = true;
4042 #ifdef HAVE_ZOLTAN2_OMP
4043 #pragma omp atomic
4044 #endif
4045  my_num_incomplete_cut -= 1;
4046  new_current_cut_coordinates [i] = current_cut_coordinates[i];
4047  continue;
4048  }
4049  else if(imbalance_on_left < 0){
4050  //if left imbalance < 0 then we need to move the cut to right.
4051 
4052  if(this->distribute_points_on_cut_lines){
4053  //if it is okay to distribute the coordinates lying on
4054  //the cut line to the left and right,
4055  //then check whether we can reach the target weight by including the
4056  //on-cut coordinates in the part.
4057  if (current_global_part_weights[i * 2 + 1] == expected_weight_in_part){
4058  //if so, we are done.
4059  current_cut_line_determined[i] = true;
4060 #ifdef HAVE_ZOLTAN2_OMP
4061 #pragma omp atomic
4062 #endif
4063  my_num_incomplete_cut -= 1;
4064 
4065  //then assign everything on the cut to the left of the cut.
4066  new_current_cut_coordinates [i] = current_cut_coordinates[i];
4067 
4068  //for this cut all the weight on cut will be put to left.
4069 
4070  current_part_cut_line_weight_to_put_left[i] = current_local_part_weights[i * 2 + 1] - current_local_part_weights[i * 2];
4071  continue;
4072  }
4073  else if (current_global_part_weights[i * 2 + 1] > expected_weight_in_part){
4074 
4075  //if the weight is larger than the expected weight,
4076  //then we need to distribute some points to left, some to right.
4077  current_cut_line_determined[i] = true;
4078 #ifdef HAVE_ZOLTAN2_OMP
4079 #pragma omp atomic
4080 #endif
4081  *rectilinear_cut_count += 1;
4082  //increase the num cuts to be determined with rectilinear partitioning.
4083 
4084 #ifdef HAVE_ZOLTAN2_OMP
4085 #pragma omp atomic
4086 #endif
4087  my_num_incomplete_cut -= 1;
4088  new_current_cut_coordinates [i] = current_cut_coordinates[i];
4089  this->process_rectilinear_cut_weight[i] = current_local_part_weights[i * 2 + 1] -
4090  current_local_part_weights[i * 2];
4091  continue;
4092  }
4093  }
4094  //we need to move further right, so set the lower bound to the current line, and shift it to the closest point on the right.
4095  current_cut_lower_bounds[i] = current_global_right_closest_points[i];
4096  //set the lower bound weight to the weight we have seen.
4097  current_cut_lower_bound_weights[i] = seen_weight_in_part;
4098 
4099  //compare the upper bound with what was found in the last iteration.
4100  //we try to make stricter bounds for the cut here.
4101  for (mj_part_t ii = i + 1; ii < num_cuts ; ++ii){
4102  mj_scalar_t p_weight = current_global_part_weights[ii * 2];
4103  mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];
4104 
4105  if(p_weight >= expected_weight_in_part){
4106  //if a cut on the right has the expected weight, then we have found
4107  //our cut position. Set the upper and lower bounds to this new cut coordinate.
4108  //but we need one more iteration to finalize the cut position,
4109  //as we need to update the part ids.
4110  if(p_weight == expected_weight_in_part){
4111  current_cut_upper_bounds[i] = current_cut_coordinates[ii];
4112  current_cut_upper_weights[i] = p_weight;
4113  current_cut_lower_bounds[i] = current_cut_coordinates[ii];
4114  current_cut_lower_bound_weights[i] = p_weight;
4115  } else if (p_weight < current_cut_upper_weights[i]){
4116  //if a part weight is larger than my expected weight,
4117  //but lower than my upper bound weight, update the upper bound.
4118  current_cut_upper_bounds[i] = current_global_left_closest_points[ii];
4119  current_cut_upper_weights[i] = p_weight;
4120  }
4121  break;
4122  }
4123  //if we get here then p_weight < expected_weight_in_part,
4124  //so compare against the line weight.
4125  if(line_weight >= expected_weight_in_part){
4126  //if the line weight is larger than the expected weight,
4127  //then we need to reach the balance by distributing coordinates on this line.
4128  current_cut_upper_bounds[i] = current_cut_coordinates[ii];
4129  current_cut_upper_weights[i] = line_weight;
4130  current_cut_lower_bounds[i] = current_cut_coordinates[ii];
4131  current_cut_lower_bound_weights[i] = p_weight;
4132  break;
4133  }
4134  //if a stricter lower bound is found,
4135  //update the lower bound.
4136  if (p_weight <= expected_weight_in_part && p_weight >= current_cut_lower_bound_weights[i]){
4137  current_cut_lower_bounds[i] = current_global_right_closest_points[ii] ;
4138  current_cut_lower_bound_weights[i] = p_weight;
4139  }
4140  }
4141 
4142 
4143  mj_scalar_t new_cut_position = 0;
4144  this->mj_calculate_new_cut_position(
4145  current_cut_upper_bounds[i],
4146  current_cut_lower_bounds[i],
4147  current_cut_upper_weights[i],
4148  current_cut_lower_bound_weights[i],
4149  expected_weight_in_part, new_cut_position);
4150 
4151  //if cut line does not move significantly.
4152  //then finalize the search.
4153  if (ZOLTAN2_ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
4154  /*|| current_cut_lower_bounds[i] - current_cut_upper_bounds[i] > this->sEpsilon*/
4155  ){
4156  current_cut_line_determined[i] = true;
4157 #ifdef HAVE_ZOLTAN2_OMP
4158 #pragma omp atomic
4159 #endif
4160  my_num_incomplete_cut -= 1;
4161 
4162  //set the cut coordinate and proceed.
4163  new_current_cut_coordinates [i] = current_cut_coordinates[i];
4164  } else {
4165  new_current_cut_coordinates [i] = new_cut_position;
4166  }
4167  } else {
4168 
4169  //need to move the cut line to left.
4170  //set upper bound to current line.
4171  current_cut_upper_bounds[i] = current_global_left_closest_points[i];
4172  current_cut_upper_weights[i] = seen_weight_in_part;
4173 
4174  // compare the current cut line weights with previous upper and lower bounds.
4175  for (int ii = i - 1; ii >= 0; --ii){
4176  mj_scalar_t p_weight = current_global_part_weights[ii * 2];
4177  mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];
4178  if(p_weight <= expected_weight_in_part){
4179  if(p_weight == expected_weight_in_part){
4180  //if the weight of the part is exactly my expected weight,
4181  //then we have found the solution.
4182  current_cut_upper_bounds[i] = current_cut_coordinates[ii];
4183  current_cut_upper_weights[i] = p_weight;
4184  current_cut_lower_bounds[i] = current_cut_coordinates[ii];
4185  current_cut_lower_bound_weights[i] = p_weight;
4186  }
4187  else if (p_weight > current_cut_lower_bound_weights[i]){
4188  //if found weight is bigger than the lower bound
4189  //then update the lower bound.
4190  current_cut_lower_bounds[i] = current_global_right_closest_points[ii];
4191  current_cut_lower_bound_weights[i] = p_weight;
4192 
4193  //at the same time, if the weight of the line is bigger than the
4194  //expected weight, then update the upper bound as well.
4195  //in this case the balance will be obtained by distributing weights
4196  //on this cut position.
4197  if(line_weight > expected_weight_in_part){
4198  current_cut_upper_bounds[i] = current_global_right_closest_points[ii];
4199  current_cut_upper_weights[i] = line_weight;
4200  }
4201  }
4202  break;
4203  }
4204  //if the weight of the cut on the left is still bigger than my expected weight,
4205  //and the weight is smaller than the current upper weight,
4206  //or the weight is equal to the current upper weight but located to the left of
4207  //the current upper bound, then update the upper bound.
4208  if (p_weight >= expected_weight_in_part &&
4209  (p_weight < current_cut_upper_weights[i] ||
4210  (p_weight == current_cut_upper_weights[i] &&
4211  current_cut_upper_bounds[i] > current_global_left_closest_points[ii]
4212  )
4213  )
4214  ){
4215  current_cut_upper_bounds[i] = current_global_left_closest_points[ii] ;
4216  current_cut_upper_weights[i] = p_weight;
4217  }
4218  }
4219  mj_scalar_t new_cut_position = 0;
4220  this->mj_calculate_new_cut_position(
4221  current_cut_upper_bounds[i],
4222  current_cut_lower_bounds[i],
4223  current_cut_upper_weights[i],
4224  current_cut_lower_bound_weights[i],
4225  expected_weight_in_part,
4226  new_cut_position);
4227 
4228  //if cut line does not move significantly.
4229  if (ZOLTAN2_ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
4230  /*|| current_cut_lower_bounds[i] - current_cut_upper_bounds[i] > this->sEpsilon*/ ){
4231  current_cut_line_determined[i] = true;
4232 #ifdef HAVE_ZOLTAN2_OMP
4233 #pragma omp atomic
4234 #endif
4235  my_num_incomplete_cut -= 1;
4236  //set the cut coordinate and proceed.
4237  new_current_cut_coordinates [ i] = current_cut_coordinates[i];
4238  } else {
4239  new_current_cut_coordinates [ i] = new_cut_position;
4240  }
4241  }
4242  }
4243 
4244  { // This unnecessary bracket works around a compiler bug in NVCC when enabling OpenMP as well
4245 
4246  //communication to determine the ratios of processors for the distribution
4247  //of coordinates on the cut lines.
4248 #ifdef HAVE_ZOLTAN2_OMP
4249  //no need for a barrier here as it is implicit.
4250 #pragma omp single
4251 #endif
4252  {
4253  if(*rectilinear_cut_count > 0){
4254 
4255  try{
4256  Teuchos::scan<int,mj_scalar_t>(
4257  *comm, Teuchos::REDUCE_SUM,
4258  num_cuts,
4259  this->process_rectilinear_cut_weight,
4260  this->global_rectilinear_cut_weight
4261  );
4262  }
4263  Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
4264 
4265  for (mj_part_t i = 0; i < num_cuts; ++i){
4266  //if cut line weight to be distributed.
4267  if(this->global_rectilinear_cut_weight[i] > 0) {
4268  //expected weight to go to left of the cut.
4269  mj_scalar_t expected_part_weight = current_part_target_weights[i];
4270  //the weight that should be put to left of the cut.
4271  mj_scalar_t necessary_weight_on_line_for_left = expected_part_weight - current_global_part_weights[i * 2];
4272  //the weight of the cut in the process
4273  mj_scalar_t my_weight_on_line = this->process_rectilinear_cut_weight[i];
4274  //the sum of the cut weights up to this process, including the weight of this process.
4275  mj_scalar_t weight_on_line_upto_process_inclusive = this->global_rectilinear_cut_weight[i];
4276  //the space remaining on the left side of the cut after all processes up to and including this process
4277  //put their on-cut weights to the left.
4278  mj_scalar_t space_to_put_left = necessary_weight_on_line_for_left - weight_on_line_upto_process_inclusive;
4279  //add my weight to this space to find out how much space is left to me.
4280  mj_scalar_t space_left_to_me = space_to_put_left + my_weight_on_line;
4281 
4282  /*
4283  std::cout << "expected_part_weight:" << expected_part_weight
4284  << " necessary_weight_on_line_for_left:" << necessary_weight_on_line_for_left
4285  << " my_weight_on_line" << my_weight_on_line
4286  << " weight_on_line_upto_process_inclusive:" << weight_on_line_upto_process_inclusive
4287  << " space_to_put_left:" << space_to_put_left
4288  << " space_left_to_me" << space_left_to_me << std::endl;
4289  */
4290  if(space_left_to_me < 0){
4291  //space_left_to_me is negative; I don't need to put anything to the left.
4292  current_part_cut_line_weight_to_put_left[i] = 0;
4293  }
4294  else if(space_left_to_me >= my_weight_on_line){
4295  //the space left to me is bigger than this processor's weight on the cut,
4296  //so put everything to the left.
4297  current_part_cut_line_weight_to_put_left[i] = my_weight_on_line;
4298  //std::cout << "setting current_part_cut_line_weight_to_put_left to my_weight_on_line:" << my_weight_on_line << std::endl;
4299  }
4300  else {
4301  //put only as much weight as there is space.
4302  current_part_cut_line_weight_to_put_left[i] = space_left_to_me ;
4303 
4304  //std::cout << "setting current_part_cut_line_weight_to_put_left to space_left_to_me:" << space_left_to_me << std::endl;
4305  }
4306 
4307  }
4308  }
4309  *rectilinear_cut_count = 0;
4310  }
4311  }
4312  }
4313 }
4314 
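// Illustration: mj_get_new_cut_coordinates above refines each cut with a weighted
// bisection-like search: bounds and their weights are tightened using the observed
// part weights and closest points, and a cut is declared done once both sides are
// within the imbalance tolerance or the cut stops moving. Below is a deliberately
// simplified single-cut, single-process sketch that uses plain midpoint bisection
// instead of the library's weight interpolation and closest-point snapping; all
// data and tolerances are made up.
#if 0   // illustrative only; never compiled with this header
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  std::vector<double> coords = {0.1, 0.4, 1.2, 2.7, 3.3, 4.9, 5.0, 7.5};
  double lo = 0.0, hi = 8.0;
  double target = coords.size() / 2.0;        // want half of the (unit) weight on the left
  double cut = 0.5 * (lo + hi), eps = 1e-9;
  for (int iter = 0; iter < 60; ++iter) {
    double left_weight = 0.0;
    for (double c : coords) if (c < cut) left_weight += 1.0;
    if (std::fabs(left_weight - target) < 0.5) break;   // imbalance small enough
    if (left_weight < target) lo = cut; else hi = cut;  // tighten the bounds
    double next = 0.5 * (lo + hi);
    if (std::fabs(next - cut) < eps) break;             // cut no longer moves
    cut = next;
  }
  std::printf("cut at %g\n", cut);                      // 3 for this data
  return 0;
}
#endif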
4324 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4325  typename mj_part_t>
4326 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::get_processor_num_points_in_parts(
4327  mj_part_t num_procs,
4328  mj_part_t num_parts,
4329  mj_gno_t *&num_points_in_all_processor_parts){
4330 
4331  //allocate num_parts entries per processor plus num_parts entries for the global sums.
4332  size_t allocation_size = num_parts * (num_procs + 1);
4333 
4334  //this will be output
4335  //holds how many points each processor has in each part.
4336  //last portion is the sum of all processor points in each part.
4337 
4338  //allocate memory for the local num coordinates in each part.
4339  mj_gno_t *num_local_points_in_each_part_to_reduce_sum = allocMemory<mj_gno_t>(allocation_size);
4340 
4341 
4342  //this is the portion of the memory which will be used
4343  //at the summation to obtain total number of processors' points in each part.
4344  mj_gno_t *my_local_points_to_reduce_sum = num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
4345  //this is the portion of the memory where each stores its local number.
4346  //this information is needed by other processors.
4347  mj_gno_t *my_local_point_counts_in_each_part = num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
4348 
4349  //initialize the array with 0's.
4350  memset(num_local_points_in_each_part_to_reduce_sum, 0, sizeof(mj_gno_t)*allocation_size);
4351 
4352  //write the number of coordinates in each part.
4353  for (mj_part_t i = 0; i < num_parts; ++i){
4354  mj_lno_t part_begin_index = 0;
4355  if (i > 0){
4356  part_begin_index = this->new_part_xadj[i - 1];
4357  }
4358  mj_lno_t part_end_index = this->new_part_xadj[i];
4359  my_local_points_to_reduce_sum[i] = part_end_index - part_begin_index;
4360  }
4361 
4362  //copy the local counts from the last portion of the array into this rank's own portion,
4363  //so that after the reduction every rank's counts and the global totals (last portion) are all available.
4364  memcpy (my_local_point_counts_in_each_part,
4365  my_local_points_to_reduce_sum,
4366  sizeof(mj_gno_t) * (num_parts) );
4367 
4368 
4369  //reduceAll operation.
4370  //the portion that belongs to the processor with rank p
4371  //starts at p * num_parts.
4372  //the global number of points in each part is held in the last num_parts entries.
4373  try{
4374  reduceAll<int, mj_gno_t>(
4375  *(this->comm),
4376  Teuchos::REDUCE_SUM,
4377  allocation_size,
4378  num_local_points_in_each_part_to_reduce_sum,
4379  num_points_in_all_processor_parts);
4380  }
4381  Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
4382  freeArray<mj_gno_t>(num_local_points_in_each_part_to_reduce_sum);
4383 }
4384 
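// Illustration: get_processor_num_points_in_parts above packs a buffer of
// (num_procs + 1) * num_parts entries: rank p writes its local per-part counts both
// into row p and into the extra last row, and an element-wise sum reduction then
// leaves each rank's counts in the first num_procs rows and the global counts in the
// last row. A serial emulation of that layout; the reduction is simulated with a
// loop instead of Teuchos::reduceAll, and the counts are made up.
#if 0   // illustrative only; never compiled with this header
#include <cstdio>
#include <vector>

int main() {
  int num_procs = 3, num_parts = 2;
  // What each "rank" would contribute locally (per-part point counts).
  std::vector<std::vector<long>> local = {{5, 1}, {2, 4}, {0, 3}};

  // Element-wise sum of the per-rank buffers (each zero except its own row
  // and the last row) mimics the REDUCE_SUM over all ranks.
  std::vector<long> result((num_procs + 1) * num_parts, 0);
  for (int rank = 0; rank < num_procs; ++rank)
    for (int part = 0; part < num_parts; ++part) {
      result[rank * num_parts + part]      += local[rank][part];  // row of 'rank'
      result[num_procs * num_parts + part] += local[rank][part];  // global row
    }

  for (int row = 0; row <= num_procs; ++row) {
    for (int part = 0; part < num_parts; ++part)
      std::printf("%ld ", result[row * num_parts + part]);
    if (row == num_procs) std::printf("<- global totals\n");
    else                  std::printf("<- rank %d\n", row);
  }
  return 0;
}
#endif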
4385 
4386 
4399 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4400  typename mj_part_t>
4401 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_check_to_migrate(
4402  size_t migration_reduce_all_population,
4403  mj_lno_t num_coords_for_last_dim_part,
4404  mj_part_t num_procs,
4405  mj_part_t num_parts,
4406  mj_gno_t *num_points_in_all_processor_parts){
4407 
4408  //if reduce all count and population in the last dim is too high
4409  if (migration_reduce_all_population > FUTURE_REDUCEALL_CUTOFF) return true;
4410  //if the work in a part per processor in the last dim is too low.
4411  if (num_coords_for_last_dim_part < MIN_WORK_LAST_DIM) return true;
4412 
4413  //if migration is to be checked and the imbalance is too high
4414  if (this->check_migrate_avoid_migration_option == 0){
4415  double global_imbalance = 0;
4416  //global shift to reach the sum of coordinate counts in each part.
4417  size_t global_shift = num_procs * num_parts;
4418 
4419  for (mj_part_t ii = 0; ii < num_procs; ++ii){
4420  for (mj_part_t i = 0; i < num_parts; ++i){
4421  double ideal_num = num_points_in_all_processor_parts[global_shift + i]
4422  / double(num_procs);
4423 
4424  global_imbalance += ZOLTAN2_ABS(ideal_num -
4425  num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num);
4426  }
4427  }
4428  global_imbalance /= num_parts;
4429  global_imbalance /= num_procs;
4430 
4431  /*
4432  if (this->myRank == 0) {
4433  std::cout << "imbalance for next iteration:" << global_imbalance << std::endl;
4434  }
4435  */
4436 
4437  if(global_imbalance <= this->minimum_migration_imbalance){
4438  return false;
4439  }
4440  else {
4441  return true;
4442  }
4443  }
4444  else {
4445  //if migration is forced
4446  return true;
4447  }
4448 }
4449 
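// Illustration: the imbalance test in mj_check_to_migrate compares each processor's
// share of every part against the ideal share (the part's global count divided by
// num_procs) and averages the relative deviation over all (processor, part) pairs.
// A standalone sketch of that metric with made-up counts:
#if 0   // illustrative only; never compiled with this header
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  int num_procs = 2, num_parts = 2;
  // counts[p * num_parts + i]: points of part i currently held by processor p.
  std::vector<double> counts = {60, 10,    // processor 0
                                20, 30};   // processor 1
  std::vector<double> global = {80, 40};   // per-part totals (column sums)

  double imbalance = 0.0;
  for (int p = 0; p < num_procs; ++p)
    for (int i = 0; i < num_parts; ++i) {
      double ideal = global[i] / num_procs;
      imbalance += std::fabs(ideal - counts[p * num_parts + i]) / ideal;
    }
  imbalance /= num_parts;
  imbalance /= num_procs;
  std::printf("average relative imbalance: %g\n", imbalance);  // 0.5 here
  return 0;
}
#endif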
4450 
4460 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4461  typename mj_part_t>
4462 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::assign_send_destinations(
4463  mj_part_t num_parts,
4464  mj_part_t *part_assignment_proc_begin_indices,
4465  mj_part_t *processor_chains_in_parts,
4466  mj_lno_t *send_count_to_each_proc,
4467  int *coordinate_destinations){
4468 
4469  for (mj_part_t p = 0; p < num_parts; ++p){
4470  mj_lno_t part_begin = 0;
4471  if (p > 0) part_begin = this->new_part_xadj[p - 1];
4472  mj_lno_t part_end = this->new_part_xadj[p];
4473 
4474  //get the first processor to which the current processor will send its part-p points.
4475  mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
4476  //initialize how many points I have sent to this processor.
4477  mj_lno_t num_total_send = 0;
4478  for (mj_lno_t j=part_begin; j < part_end; j++){
4479  mj_lno_t local_ind = this->new_coordinate_permutations[j];
4480  while (num_total_send >= send_count_to_each_proc[proc_to_sent]){
4481  //then get the next processor to send the points in part p.
4482  num_total_send = 0;
4483  //assign new processor to part_assign_begin[p]
4484  part_assignment_proc_begin_indices[p] = processor_chains_in_parts[proc_to_sent];
4485  //remove the previous processor
4486  processor_chains_in_parts[proc_to_sent] = -1;
4487  //choose the next processor as the next one to send.
4488  proc_to_sent = part_assignment_proc_begin_indices[p];
4489  }
4490  //write the destination processor to the corresponding position in coordinate_destinations.
4491  coordinate_destinations[local_ind] = proc_to_sent;
4492  ++num_total_send;
4493  }
4494  }
4495 }
4496 
4511 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4512  typename mj_part_t>
4513 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_assign_proc_to_parts(
4514  mj_gno_t * num_points_in_all_processor_parts,
4515  mj_part_t num_parts,
4516  mj_part_t num_procs,
4517  mj_lno_t *send_count_to_each_proc,
4518  std::vector<mj_part_t> &processor_ranks_for_subcomm,
4519  std::vector<mj_part_t> *next_future_num_parts_in_parts,
4520  mj_part_t &out_part_index,
4521  mj_part_t &output_part_numbering_begin_index,
4522  int *coordinate_destinations){
4523 
4524 
4525  mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
4526  mj_part_t *num_procs_assigned_to_each_part = allocMemory<mj_part_t>(num_parts);
4527 
4528  //boolean flag showing whether this process has found the part it is assigned to.
4529  bool did_i_find_my_group = false;
4530 
4531  mj_part_t num_free_procs = num_procs;
4532  mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;
4533 
4534  double max_imbalance_difference = 0;
4535  mj_part_t max_differing_part = 0;
4536 
4537  //find how many processors each part requires.
4538  for (mj_part_t i=0; i < num_parts; i++){
4539 
4540  //scalar portion of the required processors
4541  double scalar_required_proc = num_procs *
4542  (double (global_num_points_in_parts[i]) / double (this->num_global_coords));
4543 
4544  //round it to the closest integer; make sure we have at least one proc.
4545  mj_part_t required_proc = static_cast<mj_part_t> (0.5 + scalar_required_proc);
4546  if (required_proc == 0) required_proc = 1;
4547 
4548  //if assigning the required number of procs creates problems for the rest of the parts,
4549  //then only assign {num_free_procs - (minimum_num_procs_required_for_rest_of_parts)} procs to this part.
4550  if (num_free_procs - required_proc < minimum_num_procs_required_for_rest_of_parts){
4551  required_proc = num_free_procs - (minimum_num_procs_required_for_rest_of_parts);
4552  }
4553 
4554  //reduce the free processor count
4555  num_free_procs -= required_proc;
4556  //reduce by 1 the minimum processor count required for the rest of the parts.
4557  --minimum_num_procs_required_for_rest_of_parts;
4558 
4559  //part (i) is assigned to (required_proc) processors.
4560  num_procs_assigned_to_each_part[i] = required_proc;
4561 
4562  //because of the rounding, some processors might be left unassigned.
4563  //we want to assign those processors to the part with most imbalance.
4564  //find the part with the maximum imbalance here.
4565  double imbalance_wrt_ideal = (scalar_required_proc - required_proc) / required_proc;
4566  if (imbalance_wrt_ideal > max_imbalance_difference){
4567  max_imbalance_difference = imbalance_wrt_ideal;
4568  max_differing_part = i;
4569  }
4570  }
4571 
4572  //assign the extra processors to the part with the maximum imbalance with respect to the ideal.
4573  if (num_free_procs > 0){
4574  num_procs_assigned_to_each_part[max_differing_part] += num_free_procs;
4575  }
4576 
4577  //now find what are the best processors with least migration for each part.
4578 
4579  //part_assignment_proc_begin_indices[i] holds the first processor in the chain
4580  //of processors to which this processor sends its data for part i.
4581  mj_part_t *part_assignment_proc_begin_indices = allocMemory<mj_part_t>(num_parts);
4582  //the next processor to send to is found in processor_chains_in_parts, in a linked-list manner.
4583  mj_part_t *processor_chains_in_parts = allocMemory<mj_part_t>(num_procs);
4584  mj_part_t *processor_part_assignments = allocMemory<mj_part_t>(num_procs);
4585 
4586  //initialize the assignment of each processor.
4587  //this has a linked-list implementation.
4588  //the first of the processors assigned
4589  //to each part is held at part_assignment_proc_begin_indices[part].
4590  //then the next processor assigned to that part is located at
4591  //processor_chains_in_parts[part_assignment_proc_begin_indices[part]]; the chain continues
4592  //until the value -1 is reached.
4593  for (int i = 0; i < num_procs; ++i ){
4594  processor_part_assignments[i] = -1;
4595  processor_chains_in_parts[i] = -1;
4596  }
4597  for (int i = 0; i < num_parts; ++i ){
4598  part_assignment_proc_begin_indices[i] = -1;
4599  }
4600 
4601 
4602  //std::cout << "Before migration: mig type:" << this->migration_type << std::endl;
4603  //Allocate memory for sorting data structure.
4604  uSignedSortItem<mj_part_t, mj_gno_t, char> * sort_item_num_part_points_in_procs = allocMemory <uSignedSortItem<mj_part_t, mj_gno_t, char> > (num_procs);
4605  for(mj_part_t i = 0; i < num_parts; ++i){
4606  //the algorithm tries to minimize the cost of migration,
4607  //by assigning the processors with highest number of coordinates on that part.
4608  //here we might want to implement a maximum weighted bipartite matching algorithm.
4609  for(mj_part_t ii = 0; ii < num_procs; ++ii){
4610  sort_item_num_part_points_in_procs[ii].id = ii;
4611  //if processor is not assigned yet.
4612  //add its num points to the sort data structure.
4613  if (processor_part_assignments[ii] == -1){
4614  sort_item_num_part_points_in_procs[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4615  sort_item_num_part_points_in_procs[ii].signbit = 1; //indicate that the processor has positive weight.
4616  }
4617  else {
4618  //if processor is already assigned, insert -nLocal - 1 so that it won't be selected again.
4619  //would be same if we simply set it to -1,
4620  //but more information with no extra cost (which is used later) is provided.
4621  //sort_item_num_part_points_in_procs[ii].val = -num_points_in_all_processor_parts[ii * num_parts + i] - 1;
4622 
4623  //UPDATE: Since the above produces a warning when an unsigned type is used, we added an extra sign bit to the sort item.
4624  //It is 1 for positives, 0 for negatives.
4625  sort_item_num_part_points_in_procs[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4626  sort_item_num_part_points_in_procs[ii].signbit = 0;
4627  }
4628  }
4629  //sort the processors in the part.
4630  uqSignsort<mj_part_t, mj_gno_t,char>(num_procs, sort_item_num_part_points_in_procs);
4631 
4632  /*
4633  for(mj_part_t ii = 0; ii < num_procs; ++ii){
4634  std::cout << "ii:" << ii << " " << sort_item_num_part_points_in_procs[ii].id <<
4635  " " << sort_item_num_part_points_in_procs[ii].val <<
4636  " " << int(sort_item_num_part_points_in_procs[ii].signbit) << std::endl;
4637  }
4638  */
4639 
4640  mj_part_t required_proc_count = num_procs_assigned_to_each_part[i];
4641  mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
4642  mj_gno_t ideal_num_points_in_a_proc =
4643  Teuchos::as<mj_gno_t>(ceil (total_num_points_in_part / double (required_proc_count)));
4644 
4645  //start sending to the least loaded of the assigned processors.
4646  mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
4647  mj_part_t next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
4648  mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
4649 
4650  //find the processors that will be assigned to this part, which are the heaviest
4651  //unassigned processors.
4652  for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
4653  mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].id;
4654  //assign processor to part - i.
4655  processor_part_assignments[proc_id] = i;
4656  }
4657 
4658  bool did_change_sign = false;
4659  //if processor has a minus count, reverse it.
4660  for(mj_part_t ii = 0; ii < num_procs; ++ii){
4661  // TODO: THE LINE BELOW PRODUCES A WARNING IF gno_t IS UNSIGNED
4662  // TODO: SEE BUG 6194
4663  if (sort_item_num_part_points_in_procs[ii].signbit == 0){
4664  did_change_sign = true;
4665  sort_item_num_part_points_in_procs[ii].signbit = 1;
4666  }
4667  else {
4668  break;
4669  }
4670  }
4671  if(did_change_sign){
4672  //re-sort the remaining processors that are not yet assigned.
4673  uqSignsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count, sort_item_num_part_points_in_procs);
4674  }
4675  /*
4676  for(mj_part_t ii = 0; ii < num_procs; ++ii){
4677  std::cout << "after resort ii:" << ii << " " << sort_item_num_part_points_in_procs[ii].id <<
4678  " " << sort_item_num_part_points_in_procs[ii].val <<
4679  " " << int(sort_item_num_part_points_in_procs[ii].signbit ) << std::endl;
4680  }
4681  */
4682 
4683  //check if this processor is one of the procs assigned to this part.
4684  //if it is, then get the group.
4685  if (!did_i_find_my_group){
4686  for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
4687 
4688  mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].id;
4689  //add the proc to the group.
4690  processor_ranks_for_subcomm.push_back(proc_id_to_assign);
4691 
4692  if(proc_id_to_assign == this->myRank){
4693  //if the assigned process is me, then I have found my group.
4694  did_i_find_my_group = true;
4695  //set the beginning of part i to my rank.
4696  part_assignment_proc_begin_indices[i] = this->myRank;
4697  processor_chains_in_parts[this->myRank] = -1;
4698 
4699  //set send count to myself to the number of points that I have in part i.
4700  send_count_to_each_proc[this->myRank] = sort_item_num_part_points_in_procs[ii].val;
4701 
4702  //calculate the shift required for the output_part_numbering_begin_index
4703  for (mj_part_t in = 0; in < i; ++in){
4704  output_part_numbering_begin_index += (*next_future_num_parts_in_parts)[in];
4705  }
4706  out_part_index = i;
4707  }
4708  }
4709  //if this was not my group,
4710  //clear the subcommunicator processor array.
4711  if (!did_i_find_my_group){
4712  processor_ranks_for_subcomm.clear();
4713  }
4714  }
4715 
4716  //send the points of the nonassigned processors to the assigned processors,
4717  //starting from the heaviest nonassigned processor.
4718  //TODO: we might want to play with this part; it would allow more computational imbalance
4719  //but better communication balance.
4720  for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii){
4721  mj_part_t nonassigned_proc_id = sort_item_num_part_points_in_procs[ii].id;
4722  mj_lno_t num_points_to_sent = sort_item_num_part_points_in_procs[ii].val;
4723 
4724  //we set number of points to -to_sent - 1 for the assigned processors.
4725  //we reverse it here. This should not happen, as we have already reversed them above.
4726 #ifdef MJ_DEBUG
4727  if (num_points_to_sent < 0) {
4728  std::cout << "Migration - processor assignments - for part:" << i << "from proc:" << nonassigned_proc_id << " num_points_to_sent:" << num_points_to_sent << std::endl;
4729  exit(1);
4730  }
4731 #endif
4732 
4733  switch (migration_type){
4734  case 0:
4735  {
4736  //now sends the points to the assigned processors.
4737  while (num_points_to_sent > 0){
4738  //if the processor has enough space.
4739  if (num_points_to_sent <= space_left_in_sent_proc){
4740  //reduce the space left in the processor.
4741  space_left_in_sent_proc -= num_points_to_sent;
4742  //if my rank is the one that is sending the coordinates.
4743  if (this->myRank == nonassigned_proc_id){
4744  //set my sent count to the sent processor.
4745  send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
4746  //save the processor in the list (processor_chains_in_parts and part_assignment_proc_begin_indices)
4747  //that the processor will send its point in part-i.
4748  mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4749  part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4750  processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4751  }
4752  num_points_to_sent = 0;
4753  }
4754  else {
4755  //there might be no space left in the processor.
4756  if(space_left_in_sent_proc > 0){
4757  num_points_to_sent -= space_left_in_sent_proc;
4758 
4759  //send as the space left in the processor.
4760  if (this->myRank == nonassigned_proc_id){
4761  //send as much as the space in this case.
4762  send_count_to_each_proc[next_proc_to_send_id] = space_left_in_sent_proc;
4763  mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4764  part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4765  processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4766 
4767  }
4768  }
4769  //move on to the next processor to send to.
4770  ++next_proc_to_send_index;
4771 
4772 #ifdef MJ_DEBUG
4773  if(next_proc_to_send_index < num_procs - required_proc_count ){
4774  std::cout << "Migration - processor assignments - for part:"
4775  << i
4776  << " next_proc_to_send_index:" << next_proc_to_send_index
4777  << " num_procs:" << num_procs
4778  << " required_proc_count:" << required_proc_count
4779  << " Error: next_proc_to_send_index < num_procs - required_proc_count" << std::endl;
4780  exit(1);
4781 
4782  }
4783 #endif
4784  //send the new id.
4785  next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
4786  //set the new space in the processor.
4787  space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
4788  }
4789  }
4790  }
4791  break;
4792  default:
4793  {
4794  //to minimize messages, we want each processor to send its coordinates to only a single processor.
4795  //we do not respect imbalances here; we send all points to the next processor.
4796  if (this->myRank == nonassigned_proc_id){
4797  //set my sent count to the sent processor.
4798  send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
4799  //save the processor in the list (processor_chains_in_parts and part_assignment_proc_begin_indices)
4800  //that the processor will send its point in part-i.
4801  mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4802  part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4803  processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4804  }
4805  num_points_to_sent = 0;
4806  ++next_proc_to_send_index;
4807 
4808  //if we reached the heaviest processor, round-robin back to the beginning.
4809  if (next_proc_to_send_index == num_procs){
4810  next_proc_to_send_index = num_procs - required_proc_count;
4811  }
4812  //send the new id.
4813  next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
4814  //set the new space in the processor.
4815  space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
4816  }
4817  }
4818  }
4819  }
4820 
4821  /*
4822  for (int i = 0; i < num_procs;++i){
4823  std::cout << "me:" << this->myRank << " to part:" << i << " sends:" << send_count_to_each_proc[i] << std::endl;
4824  }
4825  */
4826 
4827 
4828  this->assign_send_destinations(
4829  num_parts,
4830  part_assignment_proc_begin_indices,
4831  processor_chains_in_parts,
4832  send_count_to_each_proc,
4833  coordinate_destinations);
4834 
4835  freeArray<mj_part_t>(part_assignment_proc_begin_indices);
4836  freeArray<mj_part_t>(processor_chains_in_parts);
4837  freeArray<mj_part_t>(processor_part_assignments);
4838  freeArray<uSignedSortItem<mj_part_t, mj_gno_t, char> > (sort_item_num_part_points_in_procs);
4839  freeArray<mj_part_t > (num_procs_assigned_to_each_part);
4840 
4841 }
4842 
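// Illustration: the first loop of mj_assign_proc_to_parts above sizes each part's
// processor group proportionally to its point count, guarantees at least one
// processor per part, reserves enough processors for the remaining parts, and hands
// any leftover processors to the part whose allocation falls furthest below its
// fractional ideal. A compact sketch of that sizing step with made-up counts:
#if 0   // illustrative only; never compiled with this header
#include <cstdio>
#include <vector>

int main() {
  int num_procs = 8;
  std::vector<long> points_in_part = {500, 90, 10};   // global points per part
  long total_points = 600;

  int num_parts = (int)points_in_part.size();
  std::vector<int> procs_per_part(num_parts, 0);
  int free_procs = num_procs, still_needed = num_parts - 1;
  double max_deficit = 0.0;
  int neediest_part = 0;

  for (int i = 0; i < num_parts; ++i) {
    double ideal = num_procs * (double)points_in_part[i] / (double)total_points;
    int assigned = (int)(0.5 + ideal);                 // round to nearest
    if (assigned == 0) assigned = 1;                   // at least one processor
    if (free_procs - assigned < still_needed)          // keep one per remaining part
      assigned = free_procs - still_needed;
    free_procs -= assigned;
    --still_needed;
    procs_per_part[i] = assigned;
    double deficit = (ideal - assigned) / assigned;    // loss due to rounding
    if (deficit > max_deficit) { max_deficit = deficit; neediest_part = i; }
  }
  if (free_procs > 0) procs_per_part[neediest_part] += free_procs;  // leftovers

  for (int i = 0; i < num_parts; ++i)
    std::printf("part %d -> %d procs\n", i, procs_per_part[i]);     // 6, 1, 1
  return 0;
}
#endif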
4843 
4856 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4857  typename mj_part_t>
4858 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::assign_send_destinations2(
4859  mj_part_t num_parts,
4860  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment, //input sorted wrt processors
4861  int *coordinate_destinations,
4862  mj_part_t &output_part_numbering_begin_index,
4863  std::vector<mj_part_t> *next_future_num_parts_in_parts){
4864 
4865  mj_part_t part_shift_amount = output_part_numbering_begin_index;
4866  mj_part_t previous_processor = -1;
4867  for(mj_part_t i = 0; i < num_parts; ++i){
4868  mj_part_t p = sort_item_part_to_proc_assignment[i].id;
4869  //assigned processors are sorted.
4870  mj_lno_t part_begin_index = 0;
4871  if (p > 0) part_begin_index = this->new_part_xadj[p - 1];
4872  mj_lno_t part_end_index = this->new_part_xadj[p];
4873 
4874  mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].val;
4875  if (this->myRank == assigned_proc && previous_processor != assigned_proc){
4876  output_part_numbering_begin_index = part_shift_amount;
4877  }
4878  previous_processor = assigned_proc;
4879  part_shift_amount += (*next_future_num_parts_in_parts)[p];
4880 
4881  for (mj_lno_t j=part_begin_index; j < part_end_index; j++){
4882  mj_lno_t localInd = this->new_coordinate_permutations[j];
4883  coordinate_destinations[localInd] = assigned_proc;
4884  }
4885  }
4886 }
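// Editor's sketch (illustrative only): the xadj/permutation idiom used above.
// part_xadj[p] is the cumulative end index of part p, so part p owns
// permutation entries [part_xadj[p-1], part_xadj[p]), and each of those entries
// names a local coordinate whose destination rank is the processor assigned to
// part p. All names below are hypothetical; only <vector> is assumed.
inline void sketch_stamp_destinations(
    const std::vector<int> &part_xadj,     // size num_parts, cumulative ends
    const std::vector<int> &permutation,   // size num_local_coords
    const std::vector<int> &part_to_proc,  // size num_parts
    std::vector<int> &destinations)        // size num_local_coords, output
{
  for (size_t p = 0; p < part_xadj.size(); ++p) {
    const int begin = (p == 0) ? 0 : part_xadj[p - 1];
    const int end = part_xadj[p];
    for (int j = begin; j < end; ++j)
      destinations[permutation[j]] = part_to_proc[p];
  }
}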
4887 
4888 
4905 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4906  typename mj_part_t>
4907 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_assign_parts_to_procs(
4908  mj_gno_t * num_points_in_all_processor_parts,
4909  mj_part_t num_parts,
4910  mj_part_t num_procs,
4911  mj_lno_t *send_count_to_each_proc, //output: sized nprocs, show the number of send point counts to each proc.
4912  std::vector<mj_part_t> *next_future_num_parts_in_parts,//input how many more partitions the part will be partitioned into.
4913  mj_part_t &out_num_part, //output, how many parts the processor will have. this is always 1 for this function.
4914  std::vector<mj_part_t> &out_part_indices, //output: the part indices which the processor is assigned to.
4915  mj_part_t &output_part_numbering_begin_index, //output: how much the part number should be shifted when setting the solution
4916  int *coordinate_destinations){
4917  out_num_part = 0;
4918 
4919  mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
4920  out_part_indices.clear();
4921 
4922  //to sort the parts that are assigned to the processors.
4923  //id is the part number, sort value is the assigned processor id.
4924  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment = allocMemory <uSortItem<mj_part_t, mj_part_t> >(num_parts);
4925  uSortItem<mj_part_t, mj_gno_t> * sort_item_num_points_of_proc_in_part_i = allocMemory <uSortItem<mj_part_t, mj_gno_t> >(num_procs);
4926 
4927 
4928  //calculate the optimal number of coordinates that should be assigned to each processor.
4929  mj_lno_t work_each = mj_lno_t (this->num_global_coords / (double (num_procs)) + 0.5f);
4930  //holds the remaining space, as the number of coordinates up to the optimal count, in each proc.
4931  mj_lno_t *space_in_each_processor = allocMemory <mj_lno_t>(num_procs);
4932  //initialize left space in each.
4933  for (mj_part_t i = 0; i < num_procs; ++i){
4934  space_in_each_processor[i] = work_each;
4935  }
4936 
4937  //we keep track of how many parts each processor is assigned to,
4938  //because for some unusual inputs it might be possible that some
4939  //processors are not assigned to any part. Using these variables,
4940  //we force each processor to have at least one part.
4941  mj_part_t *num_parts_proc_assigned = allocMemory <mj_part_t>(num_procs);
4942  memset(num_parts_proc_assigned, 0, sizeof(mj_part_t) * num_procs);
4943  int empty_proc_count = num_procs;
4944 
4945  //to sort the parts by decreasing number of their coordinates.
4946  //ids are the part numbers, sort value is the number of points in each.
4947  uSortItem<mj_part_t, mj_gno_t> * sort_item_point_counts_in_parts = allocMemory <uSortItem<mj_part_t, mj_gno_t> >(num_parts);
4948 
4949  //initially we will sort the parts according to the number of coordinates they have.
4950  //so that we will start assigning with the part that has the most number of coordinates.
4951  for (mj_part_t i = 0; i < num_parts; ++i){
4952  sort_item_point_counts_in_parts[i].id = i;
4953  sort_item_point_counts_in_parts[i].val = global_num_points_in_parts[i];
4954  }
4955  //sort parts with increasing order of loads.
4956  uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);
4957 
4958 
4959  //assigning parts to the processors
4960  //traverse the parts in decreasing order of load.
4961  //first assign the heaviest part.
4962  for (mj_part_t j = 0; j < num_parts; ++j){
4963  //sorted with increasing order, traverse inverse.
4964  mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].id;
4965  //load of the part
4966  mj_gno_t load = global_num_points_in_parts[i];
4967 
4968  //assigned processor
4969  mj_part_t assigned_proc = -1;
4970  //best processor to fall back to if the part does not fit anywhere.
4971  mj_part_t best_proc_to_assign = 0;
4972 
4973 
4974  //sort processors with increasing number of points in this part.
4975  for (mj_part_t ii = 0; ii < num_procs; ++ii){
4976  sort_item_num_points_of_proc_in_part_i[ii].id = ii;
4977 
4978  //if there are still enough parts to fill the empty processors, then proceed normally.
4979  //but if the empty processor count is equal to the number of remaining parts, then
4980  //we force part assignments to go only to empty processors.
4981  if (empty_proc_count < num_parts - j || num_parts_proc_assigned[ii] == 0){
4982  //how many points processor ii has in part i?
4983  sort_item_num_points_of_proc_in_part_i[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4984  }
4985  else {
4986  sort_item_num_points_of_proc_in_part_i[ii].val = -1;
4987  }
4988  }
4989  uqsort<mj_part_t, mj_gno_t>(num_procs, sort_item_num_points_of_proc_in_part_i);
4990 
4991  //traverse processors in decreasing order of their point counts in this part.
4992  for (mj_part_t iii = num_procs - 1; iii >= 0; --iii){
4993  mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
4994  mj_lno_t left_space = space_in_each_processor[ii] - load;
4995  //if there is enough space, assign the part to this processor.
4996  if(left_space >= 0 ){
4997  assigned_proc = ii;
4998  break;
4999  }
5000  //if space is not enough, store the best candidate processor.
5001  if (space_in_each_processor[best_proc_to_assign] < space_in_each_processor[ii]){
5002  best_proc_to_assign = ii;
5003  }
5004  }
5005 
5006  //if none had enough space, then assign it to the best candidate processor.
5007  if (assigned_proc == -1){
5008  assigned_proc = best_proc_to_assign;
5009  }
5010 
5011  if (num_parts_proc_assigned[assigned_proc]++ == 0){
5012  --empty_proc_count;
5013  }
5014  space_in_each_processor[assigned_proc] -= load;
5015  //to sort later: part i is assigned to the processor stored in assigned_proc.
5016  sort_item_part_to_proc_assignment[j].id = i; //part i
5017  sort_item_part_to_proc_assignment[j].val = assigned_proc; //assigned to processor - assignment.
5018 
5019 
5020  //if the assigned processor is me, increase my part count.
5021  if (assigned_proc == this->myRank){
5022  out_num_part++;//assigned_part_count;
5023  out_part_indices.push_back(i);
5024  }
5025  //increase the send count to that processor by the number of points in that part,
5026  //as everyone sends their coordinates in this part to the processor assigned to it.
5027  send_count_to_each_proc[assigned_proc] += num_points_in_all_processor_parts[this->myRank * num_parts + i];
5028  }
5029  freeArray<mj_part_t>(num_parts_proc_assigned);
5030  freeArray< uSortItem<mj_part_t, mj_gno_t> > (sort_item_num_points_of_proc_in_part_i);
5031  freeArray<uSortItem<mj_part_t, mj_gno_t> >(sort_item_point_counts_in_parts);
5032  freeArray<mj_lno_t >(space_in_each_processor);
5033 
5034 
5035  //sort assignments with respect to the assigned processors.
5036  uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);
5037  //fill sendBuf.
5038 
5039 
5040  this->assign_send_destinations2(
5041  num_parts,
5042  sort_item_part_to_proc_assignment,
5043  coordinate_destinations,
5044  output_part_numbering_begin_index,
5045  next_future_num_parts_in_parts);
5046 
5047  freeArray<uSortItem<mj_part_t, mj_part_t> >(sort_item_part_to_proc_assignment);
5048 }
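// Editor's sketch (simplified, illustrative only): the greedy strategy of
// mj_assign_parts_to_procs above. Parts are visited in decreasing global load;
// each part is given to the processor that already holds the most of its
// points and still has capacity, falling back to the processor with the most
// remaining space. The empty-processor guarantee and the send-count
// bookkeeping of the real code are omitted. Names are hypothetical; assumes
// <algorithm> and <vector>, both included by this file.
inline std::vector<int> sketch_assign_parts_to_procs(
    const std::vector<long long> &part_load,                // per part
    const std::vector<std::vector<long long> > &points_of,  // [proc][part]
    std::vector<long long> space)                           // per proc, copied
{
  const int num_parts = (int)part_load.size();
  const int num_procs = (int)space.size();
  std::vector<int> order(num_parts), owner(num_parts, -1);
  for (int i = 0; i < num_parts; ++i) order[i] = i;
  std::sort(order.begin(), order.end(),
            [&](int a, int b) { return part_load[a] > part_load[b]; });

  for (int k = 0; k < num_parts; ++k) {
    const int i = order[k];
    int chosen = -1, most_space = 0;
    long long most_points = -1;
    for (int q = 0; q < num_procs; ++q) {
      // prefer the processor with the most local points of part i that fits
      if (space[q] >= part_load[i] && points_of[q][i] > most_points) {
        most_points = points_of[q][i];
        chosen = q;
      }
      if (space[q] > space[most_space]) most_space = q;  // best-fit fallback
    }
    if (chosen == -1) chosen = most_space;
    space[chosen] -= part_load[i];
    owner[i] = chosen;
  }
  return owner;
}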
5049 
5050 
5068 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5069  typename mj_part_t>
5070 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_migration_part_proc_assignment(
5071  mj_gno_t * num_points_in_all_processor_parts,
5072  mj_part_t num_parts,
5073  mj_part_t num_procs,
5074  mj_lno_t *send_count_to_each_proc,
5075  std::vector<mj_part_t> &processor_ranks_for_subcomm,
5076  std::vector<mj_part_t> *next_future_num_parts_in_parts,
5077  mj_part_t &out_num_part,
5078  std::vector<mj_part_t> &out_part_indices,
5079  mj_part_t &output_part_numbering_begin_index,
5080  int *coordinate_destinations){
5081 
5082 
5083 
5084  processor_ranks_for_subcomm.clear();
5085  // if (this->num_local_coords > 0)
5086  if (num_procs > num_parts){
5087  //if there are more processors than the current number of parts
5088  //then processors share the existing parts.
5089  //at the end each processor will have a single part,
5090  //but a part will be shared by a group of processors.
5091  mj_part_t out_part_index = 0;
5092  this->mj_assign_proc_to_parts(
5093  num_points_in_all_processor_parts,
5094  num_parts,
5095  num_procs,
5096  send_count_to_each_proc,
5097  processor_ranks_for_subcomm,
5098  next_future_num_parts_in_parts,
5099  out_part_index,
5100  output_part_numbering_begin_index,
5101  coordinate_destinations
5102  );
5103 
5104  out_num_part = 1;
5105  out_part_indices.clear();
5106  out_part_indices.push_back(out_part_index);
5107  }
5108  else {
5109 
5110  //there are more parts than the processors.
5111  //therefore a processor will be assigned multiple parts,
5112  //the subcommunicators will only have a single processor.
5113  processor_ranks_for_subcomm.push_back(this->myRank);
5114 
5115  //since there are more parts than procs,
5116  //assign multiple parts to processors.
5117  this->mj_assign_parts_to_procs(
5118  num_points_in_all_processor_parts,
5119  num_parts,
5120  num_procs,
5121  send_count_to_each_proc,
5122  next_future_num_parts_in_parts,
5123  out_num_part,
5124  out_part_indices,
5125  output_part_numbering_begin_index,
5126  coordinate_destinations);
5127  }
5128 }
5129 
5142 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5143  typename mj_part_t>
5144 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_migrate_coords(
5145  mj_part_t num_procs,
5146  mj_lno_t &num_new_local_points,
5147  std::string iteration,
5148  int *coordinate_destinations,
5149  mj_part_t num_parts)
5150 {
5151 #ifdef ENABLE_ZOLTAN_MIGRATION
5152  if (sizeof(mj_lno_t) <= sizeof(int)) {
5153 
5154  // Cannot use Zoltan_Comm with local ordinals larger than ints.
5155  // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
5156  // may overflow.
5157 
5158  ZOLTAN_COMM_OBJ *plan = NULL;
5159  MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->comm));
5160  int num_incoming_gnos = 0;
5161  int message_tag = 7859;
5162 
5163  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration Z1PlanCreating-" + iteration);
5164  int ierr = Zoltan_Comm_Create(
5165  &plan,
5166  int(this->num_local_coords),
5167  coordinate_destinations,
5168  mpi_comm,
5169  message_tag,
5170  &num_incoming_gnos);
5171  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5172  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration Z1PlanCreating-" + iteration);
5173 
5174  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration Z1Migration-" + iteration);
5175  mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(num_incoming_gnos);
5176 
5177  //migrate gnos.
5178  message_tag++;
5179  ierr = Zoltan_Comm_Do(
5180  plan,
5181  message_tag,
5182  (char *) this->current_mj_gnos,
5183  sizeof(mj_gno_t),
5184  (char *) incoming_gnos);
5185  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5186 
5187  freeArray<mj_gno_t>(this->current_mj_gnos);
5188  this->current_mj_gnos = incoming_gnos;
5189 
5190 
5191  //migrate coordinates
5192  for (int i = 0; i < this->coord_dim; ++i){
5193  message_tag++;
5194  mj_scalar_t *coord = this->mj_coordinates[i];
5195 
5196  this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
5197  ierr = Zoltan_Comm_Do(
5198  plan,
5199  message_tag,
5200  (char *) coord,
5201  sizeof(mj_scalar_t),
5202  (char *) this->mj_coordinates[i]);
5203  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5204  freeArray<mj_scalar_t>(coord);
5205  }
5206 
5207  //migrate weights.
5208  for (int i = 0; i < this->num_weights_per_coord; ++i){
5209  message_tag++;
5210  mj_scalar_t *weight = this->mj_weights[i];
5211 
5212  this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
5213  ierr = Zoltan_Comm_Do(
5214  plan,
5215  message_tag,
5216  (char *) weight,
5217  sizeof(mj_scalar_t),
5218  (char *) this->mj_weights[i]);
5219  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5220  freeArray<mj_scalar_t>(weight);
5221  }
5222 
5223 
5224  //migrate owners.
5225  int *coord_own = allocMemory<int>(num_incoming_gnos);
5226  message_tag++;
5227  ierr = Zoltan_Comm_Do(
5228  plan,
5229  message_tag,
5230  (char *) this->owner_of_coordinate,
5231  sizeof(int), (char *) coord_own);
5232  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5233  freeArray<int>(this->owner_of_coordinate);
5234  this->owner_of_coordinate = coord_own;
5235 
5236 
5237  //if num procs is less than num parts,
5238  //we need the part assignment arrays as well, since
5239  //there will be multiple parts in a processor.
5240  mj_part_t *new_parts = allocMemory<mj_part_t>(num_incoming_gnos);
5241  if(num_procs < num_parts){
5242  message_tag++;
5243  ierr = Zoltan_Comm_Do(
5244  plan,
5245  message_tag,
5246  (char *) this->assigned_part_ids,
5247  sizeof(mj_part_t),
5248  (char *) new_parts);
5249  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5250  }
5251  freeArray<mj_part_t>(this->assigned_part_ids);
5252  this->assigned_part_ids = new_parts;
5253 
5254  ierr = Zoltan_Comm_Destroy(&plan);
5255  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5256  num_new_local_points = num_incoming_gnos;
5257  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration Z1Migration-" + iteration);
5258  }
5259 
5260  else
5261 
5262 #endif // ENABLE_ZOLTAN_MIGRATION
5263  {
5264  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration DistributorPlanCreating-" + iteration);
5265  Tpetra::Distributor distributor(this->comm);
5266  ArrayView<const mj_part_t> destinations( coordinate_destinations, this->num_local_coords);
5267  mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
5268  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration DistributorPlanCreating-" + iteration);
5269 
5270  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration DistributorMigration-" + iteration);
5271  {
5272  //migrate gnos.
5273  ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
5274  ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
5275  distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
5276  freeArray<mj_gno_t>(this->current_mj_gnos);
5277  this->current_mj_gnos = allocMemory<mj_gno_t>(num_incoming_gnos);
5278  memcpy(
5279  this->current_mj_gnos,
5280  received_gnos.getRawPtr(),
5281  num_incoming_gnos * sizeof(mj_gno_t));
5282  }
5283  //migrate coordinates
5284  for (int i = 0; i < this->coord_dim; ++i){
5285 
5286  ArrayView<mj_scalar_t> sent_coord(this->mj_coordinates[i], this->num_local_coords);
5287  ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
5288  distributor.doPostsAndWaits<mj_scalar_t>(sent_coord, 1, received_coord());
5289  freeArray<mj_scalar_t>(this->mj_coordinates[i]);
5290  this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
5291  memcpy(
5292  this->mj_coordinates[i],
5293  received_coord.getRawPtr(),
5294  num_incoming_gnos * sizeof(mj_scalar_t));
5295  }
5296 
5297  //migrate weights.
5298  for (int i = 0; i < this->num_weights_per_coord; ++i){
5299 
5300  ArrayView<mj_scalar_t> sent_weight(this->mj_weights[i], this->num_local_coords);
5301  ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
5302  distributor.doPostsAndWaits<mj_scalar_t>(sent_weight, 1, received_weight());
5303  freeArray<mj_scalar_t>(this->mj_weights[i]);
5304  this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
5305  memcpy(
5306  this->mj_weights[i],
5307  received_weight.getRawPtr(),
5308  num_incoming_gnos * sizeof(mj_scalar_t));
5309  }
5310 
5311  {
5312  //migrate the owners of the coordinates
5313  ArrayView<int> sent_owners(this->owner_of_coordinate, this->num_local_coords);
5314  ArrayRCP<int> received_owners(num_incoming_gnos);
5315  distributor.doPostsAndWaits<int>(sent_owners, 1, received_owners());
5316  freeArray<int>(this->owner_of_coordinate);
5317  this->owner_of_coordinate = allocMemory<int>(num_incoming_gnos);
5318  memcpy(
5319  this->owner_of_coordinate,
5320  received_owners.getRawPtr(),
5321  num_incoming_gnos * sizeof(int));
5322  }
5323 
5324  //if num procs is less than num parts,
5325  //we need the part assignment arrays as well, since
5326  //there will be multiple parts in a processor.
5327  if(num_procs < num_parts){
5328  ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
5329  ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
5330  distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
5331  freeArray<mj_part_t>(this->assigned_part_ids);
5332  this->assigned_part_ids = allocMemory<mj_part_t>(num_incoming_gnos);
5333  memcpy(
5334  this->assigned_part_ids,
5335  received_partids.getRawPtr(),
5336  num_incoming_gnos * sizeof(mj_part_t));
5337  }
5338  else {
5339  mj_part_t *new_parts = allocMemory<mj_part_t>(num_incoming_gnos);
5340  freeArray<mj_part_t>(this->assigned_part_ids);
5341  this->assigned_part_ids = new_parts;
5342  }
5343  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration DistributorMigration-" + iteration);
5344  num_new_local_points = num_incoming_gnos;
5345 
5346  }
5347 }
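// Editor's sketch (illustrative only): the Tpetra::Distributor exchange
// pattern used in the non-Zoltan branch above. A plan is created once from the
// per-element destination ranks and then reused, with one doPostsAndWaits call
// per migrated array (gnos, each coordinate dimension, each weight, owners,
// and part ids). The helper name and parameters are hypothetical; the Teuchos
// and Tpetra types are the same ones used above.
template <typename packet_t>
Teuchos::ArrayRCP<packet_t> sketch_migrate_array(
    const Teuchos::RCP<const Teuchos::Comm<int> > &comm,
    const Teuchos::ArrayView<const int> &destination_ranks,
    const Teuchos::ArrayView<const packet_t> &local_values)
{
  Tpetra::Distributor distributor(comm);
  // collective: every rank announces where each of its elements is going
  const size_t num_incoming = distributor.createFromSends(destination_ranks);
  Teuchos::ArrayRCP<packet_t> received(num_incoming);
  // exchange one packet per element, exactly as the loops above do
  distributor.doPostsAndWaits<packet_t>(local_values, 1, received());
  return received;
}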
5348 
5355 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5356  typename mj_part_t>
5357 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm){
5358  mj_part_t group_size = processor_ranks_for_subcomm.size();
5359  mj_part_t *ids = allocMemory<mj_part_t>(group_size);
5360  for(mj_part_t i = 0; i < group_size; ++i) {
5361  ids[i] = processor_ranks_for_subcomm[i];
5362  }
5363  ArrayView<const mj_part_t> idView(ids, group_size);
5364  this->comm = this->comm->createSubcommunicator(idView);
5365  freeArray<mj_part_t>(ids);
5366 }
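// Editor's sketch (illustrative only): the Teuchos sub-communicator call used
// above. createSubcommunicator() is collective on the parent communicator and
// returns Teuchos::null on ranks that are not listed, which is one reason the
// algorithm keeps mj_problemComm unchanged while this->comm shrinks after each
// migration. The helper name is hypothetical.
inline Teuchos::RCP<const Teuchos::Comm<int> > sketch_make_subcomm(
    const Teuchos::RCP<const Teuchos::Comm<int> > &parent_comm,
    const std::vector<int> &ranks_in_group)
{
  Teuchos::ArrayView<const int> rank_view(
      ranks_in_group.empty() ? NULL : &ranks_in_group[0],
      ranks_in_group.size());
  return parent_comm->createSubcommunicator(rank_view);
}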
5367 
5368 
5374 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5375  typename mj_part_t>
5376 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::fill_permutation_array(
5377  mj_part_t output_num_parts,
5378  mj_part_t num_parts){
5379  //if there is a single output part, then simply fill the permutation array.
5380  if (output_num_parts == 1){
5381  for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
5382  this->new_coordinate_permutations[i] = i;
5383  }
5384  this->new_part_xadj[0] = this->num_local_coords;
5385  }
5386  else {
5387 
5388  //otherwise we need to count how many points there are in each part.
5389  //we allocate here as num_parts, because the sent partids are up to num_parts,
5390  //although there are output_num_parts different parts.
5391  mj_lno_t *num_points_in_parts = allocMemory<mj_lno_t>(num_parts);
5392  //part_shifts holds which new part number an old part number corresponds to.
5393  mj_part_t *part_shifts = allocMemory<mj_part_t>(num_parts);
5394 
5395  memset(num_points_in_parts, 0, sizeof(mj_lno_t) * num_parts);
5396 
5397  for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
5398  mj_part_t ii = this->assigned_part_ids[i];
5399  ++num_points_in_parts[ii];
5400  }
5401 
5402  //write the end points of the parts.
5403  mj_part_t p = 0;
5404  mj_lno_t prev_index = 0;
5405  for(mj_part_t i = 0; i < num_parts; ++i){
5406  if(num_points_in_parts[i] > 0) {
5407  this->new_part_xadj[p] = prev_index + num_points_in_parts[i];
5408  prev_index += num_points_in_parts[i];
5409  part_shifts[i] = p++;
5410  }
5411  }
5412 
5413  //for the rest of the parts write the end index as end point.
5414  mj_part_t assigned_num_parts = p - 1;
5415  for (;p < num_parts; ++p){
5416  this->new_part_xadj[p] = this->new_part_xadj[assigned_num_parts];
5417  }
5418  for(mj_part_t i = 0; i < output_num_parts; ++i){
5419  num_points_in_parts[i] = this->new_part_xadj[i];
5420  }
5421 
5422  //write the permutation array here.
5423  //get the part of the coordinate i, shift it to obtain the new part number.
5424  //assign it to the end of the new part numbers pointer.
5425  for(mj_lno_t i = this->num_local_coords - 1; i >= 0; --i){
5426  mj_part_t part = part_shifts[mj_part_t(this->assigned_part_ids[i])];
5427  this->new_coordinate_permutations[--num_points_in_parts[part]] = i;
5428  }
5429 
5430  freeArray<mj_lno_t>(num_points_in_parts);
5431  freeArray<mj_part_t>(part_shifts);
5432  }
5433 }
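// Editor's sketch (illustrative only): the counting-sort style fill performed
// above. Points are counted per part, the counts are turned into cumulative
// end offsets, and the coordinates are then walked backwards so each index is
// placed just before its part's current write cursor, which keeps the relative
// order within every part. Names are hypothetical; only <vector> is assumed.
inline std::vector<int> sketch_fill_permutation(
    const std::vector<int> &part_of_coord,  // part id of each local coordinate
    int num_parts)
{
  const int n = (int)part_of_coord.size();
  std::vector<int> end_offset(num_parts, 0), permutation(n);

  for (int i = 0; i < n; ++i) ++end_offset[part_of_coord[i]];
  for (int p = 1; p < num_parts; ++p) end_offset[p] += end_offset[p - 1];

  // backwards pass: --end_offset[part] is the next free slot of that part
  for (int i = n - 1; i >= 0; --i)
    permutation[--end_offset[part_of_coord[i]]] = i;
  return permutation;
}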
5434 
5435 
5458 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5459  typename mj_part_t>
5460 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_perform_migration(
5461  mj_part_t input_num_parts, //current number of parts
5462  mj_part_t &output_num_parts, //output number of parts.
5463  std::vector<mj_part_t> *next_future_num_parts_in_parts,
5464  mj_part_t &output_part_begin_index,
5465  size_t migration_reduce_all_population,
5466  mj_lno_t num_coords_for_last_dim_part,
5467  std::string iteration,
5468  RCP<mj_partBoxVector_t> &input_part_boxes,
5469  RCP<mj_partBoxVector_t> &output_part_boxes
5470 )
5471 {
5472  mj_part_t num_procs = this->comm->getSize();
5473  this->myRank = this->comm->getRank();
5474 
5475 
5476  //this array holds how many points each processor has in each part.
5477  //to access how many points processor i has on part j,
5478  //num_points_in_all_processor_parts[i * num_parts + j]
5479  mj_gno_t *num_points_in_all_processor_parts = allocMemory<mj_gno_t>(input_num_parts * (num_procs + 1));
5480 
5481  //get the number of coordinates in each part in each processor.
5482  this->get_processor_num_points_in_parts(
5483  num_procs,
5484  input_num_parts,
5485  num_points_in_all_processor_parts);
5486 
5487 
5488  //check if migration will be performed or not.
5489  if (!this->mj_check_to_migrate(
5490  migration_reduce_all_population,
5491  num_coords_for_last_dim_part,
5492  num_procs,
5493  input_num_parts,
5494  num_points_in_all_processor_parts)){
5495  freeArray<mj_gno_t>(num_points_in_all_processor_parts);
5496  return false;
5497  }
5498 
5499 
5500  mj_lno_t *send_count_to_each_proc = NULL;
5501  int *coordinate_destinations = allocMemory<int>(this->num_local_coords);
5502  send_count_to_each_proc = allocMemory<mj_lno_t>(num_procs);
5503  for (int i = 0; i < num_procs; ++i) send_count_to_each_proc[i] = 0;
5504 
5505  std::vector<mj_part_t> processor_ranks_for_subcomm;
5506  std::vector<mj_part_t> out_part_indices;
5507 
5508  //determine which processors are assigned to which parts
5509  this->mj_migration_part_proc_assignment(
5510  num_points_in_all_processor_parts,
5511  input_num_parts,
5512  num_procs,
5513  send_count_to_each_proc,
5514  processor_ranks_for_subcomm,
5515  next_future_num_parts_in_parts,
5516  output_num_parts,
5517  out_part_indices,
5518  output_part_begin_index,
5519  coordinate_destinations);
5520 
5521 
5522 
5523 
5524  freeArray<mj_lno_t>(send_count_to_each_proc);
5525  std::vector <mj_part_t> tmpv;
5526 
5527  std::sort (out_part_indices.begin(), out_part_indices.end());
5528  mj_part_t outP = out_part_indices.size();
5529 
5530  mj_gno_t new_global_num_points = 0;
5531  mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * input_num_parts;
5532 
5533  if (this->mj_keep_part_boxes){
5534  input_part_boxes->clear();
5535  }
5536 
5537  //now we calculate the new values for next_future_num_parts_in_parts.
5538  //same for the part boxes.
5539  for (mj_part_t i = 0; i < outP; ++i){
5540  mj_part_t ind = out_part_indices[i];
5541  new_global_num_points += global_num_points_in_parts[ind];
5542  tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
5543  if (this->mj_keep_part_boxes){
5544  input_part_boxes->push_back((*output_part_boxes)[ind]);
5545  }
5546  }
5547  //swap the input and output part boxes.
5548  if (this->mj_keep_part_boxes){
5549  RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
5550  input_part_boxes = output_part_boxes;
5551  output_part_boxes = tmpPartBoxes;
5552  }
5553  next_future_num_parts_in_parts->clear();
5554  for (mj_part_t i = 0; i < outP; ++i){
5555  mj_part_t p = tmpv[i];
5556  next_future_num_parts_in_parts->push_back(p);
5557  }
5558 
5559  freeArray<mj_gno_t>(num_points_in_all_processor_parts);
5560 
5561  mj_lno_t num_new_local_points = 0;
5562 
5563 
5564  //perform the actual migration operation here.
5565  this->mj_migrate_coords(
5566  num_procs,
5567  num_new_local_points,
5568  iteration,
5569  coordinate_destinations,
5570  input_num_parts);
5571 
5572 
5573  freeArray<int>(coordinate_destinations);
5574 
5575  if(this->num_local_coords != num_new_local_points){
5576  freeArray<mj_lno_t>(this->new_coordinate_permutations);
5577  freeArray<mj_lno_t>(this->coordinate_permutations);
5578 
5579  this->new_coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
5580  this->coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
5581  }
5582  this->num_local_coords = num_new_local_points;
5583  this->num_global_coords = new_global_num_points;
5584 
5585 
5586 
5587  //create subcommunicator.
5588  this->create_sub_communicator(processor_ranks_for_subcomm);
5589  processor_ranks_for_subcomm.clear();
5590 
5591  //fill the new permutation arrays.
5592  this->fill_permutation_array(
5593  output_num_parts,
5594  input_num_parts);
5595  return true;
5596 }
5597 
5598 
5612 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5613  typename mj_part_t>
5614 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::create_consistent_chunks(
5615  mj_part_t num_parts,
5616  mj_scalar_t *mj_current_dim_coords,
5617  mj_scalar_t *current_concurrent_cut_coordinate,
5618  mj_lno_t coordinate_begin,
5619  mj_lno_t coordinate_end,
5620  mj_scalar_t *used_local_cut_line_weight_to_left,
5621  mj_lno_t *out_part_xadj,
5622  int coordInd, bool longest_dim_part, uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted){
5623 
5624  //mj_lno_t numCoordsInPart = coordinateEnd - coordinateBegin;
5625  mj_part_t no_cuts = num_parts - 1;
5626 
5627 
5628 
5629  int me = 0;
5630  mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
5631  mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;
5632 
5633 
5634  //now if the rectilinear partitioning is allowed we decide how
5635  //much weight each thread should put to left and right.
5636  if (this->distribute_points_on_cut_lines){
5637 
5638  my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
5639  for (mj_part_t i = 0; i < no_cuts; ++i){
5640  //the weight to be put on the left of the cut.
5641  mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
5642  //std::cout << "i:" << i << " left_weight:" << left_weight << std::endl;
5643  for(int ii = 0; ii < this->num_threads; ++ii){
5644  if(left_weight > this->sEpsilon){
5645  //the weight of thread ii on cut.
5646  mj_scalar_t thread_ii_weight_on_cut = this->thread_part_weight_work[ii][i * 2 + 1] - this->thread_part_weight_work[ii][i * 2 ];
5647  if(thread_ii_weight_on_cut < left_weight){
5648  this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
5649  }
5650  else {
5651  this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
5652  }
5653  left_weight -= thread_ii_weight_on_cut;
5654  }
5655  else {
5656  this->thread_cut_line_weight_to_put_left[ii][i] = 0;
5657  }
5658  }
5659  }
5660 
5661  if(no_cuts > 0){
5662  //this is a special case. If cutlines share the same coordinate, their weights are equal.
5663  //we need to adjust the ratio for that.
5664  for (mj_part_t i = no_cuts - 1; i > 0 ; --i){
5665  if(ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
5666  my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
5667  }
5668  my_local_thread_cut_weights_to_put_left[i] = static_cast<long long>((my_local_thread_cut_weights_to_put_left[i] + LEAST_SIGNIFICANCE) * SIGNIFICANCE_MUL)
5669  / mj_scalar_t(SIGNIFICANCE_MUL);
5670  }
5671  }
5672  }
5673 
5674  for(mj_part_t ii = 0; ii < num_parts; ++ii){
5675  thread_num_points_in_parts[ii] = 0;
5676  }
5677 
5678  //for this specific case we don't want to distribute the points along the cut position
5679  //randomly, as we need a specific ordering of them. Instead,
5680  //we put the coordinates into a sort item, where we sort those
5681  //using the coordinates of points on other dimensions and the index.
5682 
5683 
5684  //some of the cuts might share the same position.
5685  //in this case, if cut i and cut j share the same position
5686  //cut_map[i] = cut_map[j] = sort item index.
5687  mj_part_t *cut_map = allocMemory<mj_part_t> (no_cuts);
5688 
5689 
5690  typedef uMultiSortItem<mj_lno_t, int, mj_scalar_t> multiSItem;
5691  typedef std::vector< multiSItem > multiSVector;
5692  typedef std::vector<multiSVector> multiS2Vector;
5693 
5694  //to keep track of the memory allocated.
5695  std::vector<mj_scalar_t *>allocated_memory;
5696 
5697  //vector for which the coordinates will be sorted.
5698  multiS2Vector sort_vector_points_on_cut;
5699 
5700  //the number of cuts that have different coordinates.
5701  mj_part_t different_cut_count = 1;
5702  cut_map[0] = 0;
5703 
5704  //now we insert 1 sort vector for all cuts at different
5705  //positions. if multiple cuts are on the same position, they share sort vectors.
5706  multiSVector tmpMultiSVector;
5707  sort_vector_points_on_cut.push_back(tmpMultiSVector);
5708 
5709  for (mj_part_t i = 1; i < no_cuts ; ++i){
5710  //if cuts share the same cut coordinates
5711  //set the cutmap accordingly.
5712  if(ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
5713  cut_map[i] = cut_map[i-1];
5714  }
5715  else {
5716  cut_map[i] = different_cut_count++;
5717  multiSVector tmp2MultiSVector;
5718  sort_vector_points_on_cut.push_back(tmp2MultiSVector);
5719  }
5720  }
5721 
5722 
5723  //now the actual part assignment.
5724  for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
5725 
5726  mj_lno_t i = this->coordinate_permutations[ii];
5727 
5728  mj_part_t pp = this->assigned_part_ids[i];
5729  mj_part_t p = pp / 2;
5730  //if the coordinate is on a cut.
5731  if(pp % 2 == 1 ){
5732  mj_scalar_t *vals = allocMemory<mj_scalar_t>(this->coord_dim -1);
5733  allocated_memory.push_back(vals);
5734 
5735  //we insert the coordinates to the sort item here.
5736  int val_ind = 0;
5737 
5738  if (longest_dim_part){
5739  //std::cout << std::endl << std::endl;
5740  for(int dim = this->coord_dim - 2; dim >= 0; --dim){
5741  //uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted
5742  int next_largest_coord_dim = p_coord_dimension_range_sorted[dim].id;
5743  //std::cout << "next_largest_coord_dim: " << next_largest_coord_dim << " ";
5744  vals[val_ind++] = this->mj_coordinates[next_largest_coord_dim][i];
5745  }
5746  }
5747  else {
5748  for(int dim = coordInd + 1; dim < this->coord_dim; ++dim){
5749  vals[val_ind++] = this->mj_coordinates[dim][i];
5750  }
5751  for(int dim = 0; dim < coordInd; ++dim){
5752  vals[val_ind++] = this->mj_coordinates[dim][i];
5753  }
5754  }
5755  multiSItem tempSortItem(i, this->coord_dim -1, vals);
5756  //insert the point into the sort vector pointed to by cut_map[p].
5757  mj_part_t cmap = cut_map[p];
5758  sort_vector_points_on_cut[cmap].push_back(tempSortItem);
5759  }
5760  else {
5761  //if it is not on the cut, simple sorting.
5762  ++thread_num_points_in_parts[p];
5763  this->assigned_part_ids[i] = p;
5764  }
5765  }
5766 
5767  //sort all the sort vectors.
5768  for (mj_part_t i = 0; i < different_cut_count; ++i){
5769  std::sort (sort_vector_points_on_cut[i].begin(), sort_vector_points_on_cut[i].end());
5770  }
5771 
5772  //we do the part assignment for the points on cuts here.
5773  mj_part_t previous_cut_map = cut_map[0];
5774 
5775  //this is how much of the current part's weight the previous part has taken.
5776  //when the target part weight is 1.6 and the part on the left is given 2,
5777  //the left has an extra 0.4, while the right is missing 0.4 from the previous cut.
5778  //this parameter is used to balance these issues.
5779  //in the above example weight_stolen_from_previous_part will be 0.4.
5780  //if the left part target is 2.2 but it is given 2,
5781  //then weight_stolen_from_previous_part will be -0.2.
5782  mj_scalar_t weight_stolen_from_previous_part = 0;
5783  for (mj_part_t p = 0; p < no_cuts; ++p){
5784 
5785  mj_part_t mapped_cut = cut_map[p];
5786 
5787  //if previous cut map is done, and it does not have the same index,
5788  //then assign all points left on that cut to its right.
5789  if (previous_cut_map != mapped_cut){
5790  mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
5791  for (; sort_vector_end >= 0; --sort_vector_end){
5792  multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
5793  mj_lno_t i = t.index;
5794  ++thread_num_points_in_parts[p];
5795  this->assigned_part_ids[i] = p;
5796  }
5797  sort_vector_points_on_cut[previous_cut_map].clear();
5798  }
5799 
5800  //TODO: MD: I dont remember why I have it reverse order here.
5801  mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[mapped_cut].size() - 1;
5802  //mj_lno_t sort_vector_begin= 0;
5803  //mj_lno_t sort_vector_size = (mj_lno_t)sort_vector_points_on_cut[mapped_cut].size();
5804 
5805  //TODO commented for reverse order
5806  for (; sort_vector_end >= 0; --sort_vector_end){
5807  //for (; sort_vector_begin < sort_vector_size; ++sort_vector_begin){
5808  //TODO COMMENTED FOR REVERSE ORDER
5809  multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
5810  //multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_begin];
5811  mj_lno_t i = t.index;
5812  mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
5813 
5814 
5815  //if part p has enough space for point i, then put point i into part p.
5816  if( my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part> this->sEpsilon &&
5817  my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part - ZOLTAN2_ABS(my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part - w)
5818  > this->sEpsilon){
5819 
5820  my_local_thread_cut_weights_to_put_left[p] -= w;
5821  sort_vector_points_on_cut[mapped_cut].pop_back();
5822  ++thread_num_points_in_parts[p];
5823  this->assigned_part_ids[i] = p;
5824  //if putting this weight to left overweights the left cut, then
5825  //increase the space for the next cut using weight_stolen_from_previous_part.
5826  if(p < no_cuts - 1 && my_local_thread_cut_weights_to_put_left[p] < this->sEpsilon){
5827  if(mapped_cut == cut_map[p + 1] ){
5828  //if the cut before the cut indexed at p was also at the same position
5829  //special case, as we handle the weight differently here.
5830  if (previous_cut_map != mapped_cut){
5831  weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
5832  }
5833  else {
5834  //if the cut before the cut indexed at p was also at the same position
5835  //we assign extra weights cumulatively in this case.
5836  weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
5837  }
5838  }
5839  else{
5840  weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
5841  }
5842  //end assignment for part p
5843  break;
5844  }
5845  } else {
5846  //if part p does not have enough space for this point
5847  //and if there is another cut sharing the same position,
5848  //again increase the space for the next cut
5849  if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]){
5850  if (previous_cut_map != mapped_cut){
5851  weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
5852  }
5853  else {
5854  weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
5855  }
5856  }
5857  else{
5858  weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
5859  }
5860  //end assignment for part p
5861  break;
5862  }
5863  }
5864  previous_cut_map = mapped_cut;
5865  }
5866 
5867  //TODO commented for reverse order
5868  //put everything left on the last cut to the last part.
5869  mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
5870 
5871  //mj_lno_t sort_vector_begin= 0;
5872  //mj_lno_t sort_vector_size = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size();
5873  //TODO commented for reverse order
5874  for (; sort_vector_end >= 0; --sort_vector_end){
5875  //for (; sort_vector_begin < sort_vector_size; ++sort_vector_begin){
5876  //TODO commented for reverse order
5877  multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
5878  //multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_begin];
5879  mj_lno_t i = t.index;
5880  ++thread_num_points_in_parts[no_cuts];
5881  this->assigned_part_ids[i] = no_cuts;
5882  }
5883  sort_vector_points_on_cut[previous_cut_map].clear();
5884  freeArray<mj_part_t> (cut_map);
5885 
5886  //free the memory allocated for vertex sort items .
5887  mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
5888  for(mj_lno_t i = 0; i < vSize; ++i){
5889  freeArray<mj_scalar_t> (allocated_memory[i]);
5890  }
5891 
5892  //creation of part_xadj as in usual case.
5893  for(mj_part_t j = 0; j < num_parts; ++j){
5894  mj_lno_t num_points_in_part_j_upto_thread_i = 0;
5895  for (int i = 0; i < this->num_threads; ++i){
5896  mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
5897  this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
5898  num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;
5899 
5900  }
5901  out_part_xadj[j] = num_points_in_part_j_upto_thread_i;// + prev2; //+ coordinateBegin;
5902  }
5903 
5904  //perform prefix sum for num_points in parts.
5905  for(mj_part_t j = 1; j < num_parts; ++j){
5906  out_part_xadj[j] += out_part_xadj[j - 1];
5907  }
5908 
5909 
5910  //shift the per-part point counts of this thread to obtain the
5911  //beginning index of each thread's private space.
5912  for(mj_part_t j = 1; j < num_parts; ++j){
5913  thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
5914  }
5915 
5916  //now the thread takes each coordinate and writes its index to the permutation array
5917  //using the part index we calculated.
5918  for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
5919  mj_lno_t i = this->coordinate_permutations[ii];
5920  mj_part_t p = this->assigned_part_ids[i];
5921  this->new_coordinate_permutations[coordinate_begin +
5922  thread_num_points_in_parts[p]++] = i;
5923  }
5924 }
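// Editor's sketch (illustrative only): the two prefix sums that close the
// routine above. The first pass converts per-thread point counts into each
// thread's write offset inside its part and records the part sizes; the second
// pass turns those sizes into the cumulative out_part_xadj. The final shift of
// the offsets by the previous part's end index is omitted here. Names are
// hypothetical; only <vector> is assumed.
inline void sketch_thread_offsets_and_xadj(
    std::vector<std::vector<int> > &thread_counts,  // [thread][part], in/out
    std::vector<int> &part_xadj)                    // size num_parts, output
{
  const int num_threads = (int)thread_counts.size();
  const int num_parts = (int)part_xadj.size();

  for (int j = 0; j < num_parts; ++j) {
    int upto = 0;
    for (int t = 0; t < num_threads; ++t) {
      const int c = thread_counts[t][j];
      thread_counts[t][j] = upto;  // where thread t starts writing in part j
      upto += c;
    }
    part_xadj[j] = upto;  // number of points in part j
  }
  for (int j = 1; j < num_parts; ++j)
    part_xadj[j] += part_xadj[j - 1];  // cumulative end index of part j
}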
5925 
5926 
5927 
5937 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5938  typename mj_part_t>
5939 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_final_parts(
5940  mj_part_t current_num_parts,
5941  mj_part_t output_part_begin_index,
5942  RCP<mj_partBoxVector_t> &output_part_boxes,
5943  bool is_data_ever_migrated)
5944 {
5945  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Part_Assignment");
5946 
5947 #ifdef HAVE_ZOLTAN2_OMP
5948 #pragma omp parallel for
5949 #endif
5950  for(mj_part_t i = 0; i < current_num_parts;++i){
5951 
5952  mj_lno_t begin = 0;
5953  mj_lno_t end = this->part_xadj[i];
5954 
5955  if(i > 0) begin = this->part_xadj[i -1];
5956  mj_part_t part_to_set_index = i + output_part_begin_index;
5957  if (this->mj_keep_part_boxes){
5958  (*output_part_boxes)[i].setpId(part_to_set_index);
5959  }
5960  for (mj_lno_t ii = begin; ii < end; ++ii){
5961  mj_lno_t k = this->coordinate_permutations[ii];
5962  this->assigned_part_ids[k] = part_to_set_index;
5963  }
5964  }
5965 
5966  //ArrayRCP<const mj_gno_t> gnoList;
5967  if(!is_data_ever_migrated){
5968  //freeArray<mj_gno_t>(this->current_mj_gnos);
5969  //if(this->num_local_coords > 0){
5970  // gnoList = arcpFromArrayView(this->mj_gnos);
5971  //}
5972  }
5973  else {
5974 #ifdef ENABLE_ZOLTAN_MIGRATION
5975  if (sizeof(mj_lno_t) <= sizeof(int)) {
5976 
5977  // Cannot use Zoltan_Comm with local ordinals larger than ints.
5978  // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
5979  // may overflow.
5980 
5981  //if data is migrated, then send part numbers to the original owners.
5982  ZOLTAN_COMM_OBJ *plan = NULL;
5983  MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->mj_problemComm));
5984 
5985  int incoming = 0;
5986  int message_tag = 7856;
5987 
5988  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final Z1PlanCreating");
5989  int ierr = Zoltan_Comm_Create( &plan, int(this->num_local_coords),
5990  this->owner_of_coordinate, mpi_comm, message_tag,
5991  &incoming);
5992  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5993  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final Z1PlanCreating" );
5994 
5995  mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(incoming);
5996 
5997  message_tag++;
5998  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final Z1PlanComm");
5999  ierr = Zoltan_Comm_Do( plan, message_tag, (char *) this->current_mj_gnos,
6000  sizeof(mj_gno_t), (char *) incoming_gnos);
6001  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6002 
6003  freeArray<mj_gno_t>(this->current_mj_gnos);
6004  this->current_mj_gnos = incoming_gnos;
6005 
6006  mj_part_t *incoming_partIds = allocMemory< mj_part_t>(incoming);
6007 
6008  message_tag++;
6009  ierr = Zoltan_Comm_Do( plan, message_tag, (char *) this->assigned_part_ids,
6010  sizeof(mj_part_t), (char *) incoming_partIds);
6011  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6012  freeArray<mj_part_t>(this->assigned_part_ids);
6013  this->assigned_part_ids = incoming_partIds;
6014 
6015  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final Z1PlanComm");
6016  ierr = Zoltan_Comm_Destroy(&plan);
6017  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6018 
6019  this->num_local_coords = incoming;
6020  //gnoList = arcp(this->current_mj_gnos, 0, this->num_local_coords, true);
6021  }
6022  else
6023 
6024 #endif // ENABLE_ZOLTAN_MIGRATION
6025  {
6026  //if data is migrated, then send part numbers to the original owners.
6027  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final DistributorPlanCreating");
6028  Tpetra::Distributor distributor(this->mj_problemComm);
6029  ArrayView<const mj_part_t> owners_of_coords(this->owner_of_coordinate, this->num_local_coords);
6030  mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
6031  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final DistributorPlanCreating" );
6032 
6033  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final DistributorPlanComm");
6034  //migrate gnos to actual owners.
6035  ArrayRCP<mj_gno_t> received_gnos(incoming);
6036  ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
6037  distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
6038  freeArray<mj_gno_t>(this->current_mj_gnos);
6039  this->current_mj_gnos = allocMemory<mj_gno_t>(incoming);
6040  memcpy( this->current_mj_gnos,
6041  received_gnos.getRawPtr(),
6042  incoming * sizeof(mj_gno_t));
6043 
6044  //migrate part ids to actual owners.
6045  ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
6046  ArrayRCP<mj_part_t> received_partids(incoming);
6047  distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
6048  freeArray<mj_part_t>(this->assigned_part_ids);
6049  this->assigned_part_ids = allocMemory<mj_part_t>(incoming);
6050  memcpy( this->assigned_part_ids,
6051  received_partids.getRawPtr(),
6052  incoming * sizeof(mj_part_t));
6053 
6054  this->num_local_coords = incoming;
6055  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final DistributorPlanComm");
6056 
6057  }
6058  }
6059 
6060  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Part_Assignment");
6061 
6062  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Solution_Part_Assignment");
6063 
6064  //ArrayRCP<mj_part_t> partId;
6065  //partId = arcp(this->assigned_part_ids, 0, this->num_local_coords, true);
6066 
6067  if (this->mj_keep_part_boxes){
6068  this->kept_boxes = compute_global_box_boundaries(output_part_boxes);
6069 
6070  }
6071 
6072  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Solution_Part_Assignment");
6073 }
6074 
6077 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6078  typename mj_part_t>
6079 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::free_work_memory(){
6080  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Free");
6081 
6082  for (int i=0; i < this->coord_dim; i++){
6083  freeArray<mj_scalar_t>(this->mj_coordinates[i]);
6084  }
6085  freeArray<mj_scalar_t *>(this->mj_coordinates);
6086 
6087  for (int i=0; i < this->num_weights_per_coord; i++){
6088  freeArray<mj_scalar_t>(this->mj_weights[i]);
6089  }
6090  freeArray<mj_scalar_t *>(this->mj_weights);
6091 
6092  freeArray<int>(this->owner_of_coordinate);
6093 
6094  for(int i = 0; i < this->num_threads; ++i){
6095  freeArray<mj_lno_t>(this->thread_point_counts[i]);
6096  }
6097 
6098  freeArray<mj_lno_t *>(this->thread_point_counts);
6099  freeArray<double *> (this->thread_part_weight_work);
6100 
6101  if(this->distribute_points_on_cut_lines){
6102  freeArray<mj_scalar_t>(this->process_cut_line_weight_to_put_left);
6103  for(int i = 0; i < this->num_threads; ++i){
6104  freeArray<mj_scalar_t>(this->thread_cut_line_weight_to_put_left[i]);
6105  }
6106  freeArray<mj_scalar_t *>(this->thread_cut_line_weight_to_put_left);
6107  freeArray<mj_scalar_t>(this->process_rectilinear_cut_weight);
6108  freeArray<mj_scalar_t>(this->global_rectilinear_cut_weight);
6109  }
6110 
6111  freeArray<mj_part_t>(this->my_incomplete_cut_count);
6112 
6113  freeArray<mj_scalar_t>(this->max_min_coords);
6114 
6115  freeArray<mj_lno_t>(this->part_xadj);
6116 
6117  freeArray<mj_lno_t>(this->coordinate_permutations);
6118 
6119  freeArray<mj_lno_t>(this->new_coordinate_permutations);
6120 
6121  freeArray<mj_scalar_t>(this->all_cut_coordinates);
6122 
6123  freeArray<mj_scalar_t> (this->process_local_min_max_coord_total_weight);
6124 
6125  freeArray<mj_scalar_t> (this->global_min_max_coord_total_weight);
6126 
6127  freeArray<mj_scalar_t>(this->cut_coordinates_work_array);
6128 
6129  freeArray<mj_scalar_t>(this->target_part_weights);
6130 
6131  freeArray<mj_scalar_t>(this->cut_upper_bound_coordinates);
6132 
6133  freeArray<mj_scalar_t>(this->cut_lower_bound_coordinates);
6134 
6135  freeArray<mj_scalar_t>(this->cut_lower_bound_weights);
6136  freeArray<mj_scalar_t>(this->cut_upper_bound_weights);
6137  freeArray<bool>(this->is_cut_line_determined);
6138  freeArray<mj_scalar_t>(this->total_part_weight_left_right_closests);
6139  freeArray<mj_scalar_t>(this->global_total_part_weight_left_right_closests);
6140 
6141  for(int i = 0; i < this->num_threads; ++i){
6142  freeArray<double>(this->thread_part_weights[i]);
6143  freeArray<mj_scalar_t>(this->thread_cut_right_closest_point[i]);
6144  freeArray<mj_scalar_t>(this->thread_cut_left_closest_point[i]);
6145  }
6146 
6147  freeArray<double *>(this->thread_part_weights);
6148  freeArray<mj_scalar_t *>(this->thread_cut_left_closest_point);
6149  freeArray<mj_scalar_t *>(this->thread_cut_right_closest_point);
6150 
6151  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Free");
6152 }
6153 
6162 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6163  typename mj_part_t>
6164 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_partitioning_parameters(
6165  bool distribute_points_on_cut_lines_,
6166  int max_concurrent_part_calculation_,
6167  int check_migrate_avoid_migration_option_,
6168  double minimum_migration_imbalance_,
6169  int migration_type_ ){
6170  this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
6171  this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
6172  this->check_migrate_avoid_migration_option = check_migrate_avoid_migration_option_;
6173  this->minimum_migration_imbalance = minimum_migration_imbalance_;
6174  this->migration_type = migration_type_;
6175 
6176 }
6177 
6178 
6179 
6180 
6208 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6209  typename mj_part_t>
6210 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::multi_jagged_part(
6211 
6212  const RCP<const Environment> &env,
6213  RCP<const Comm<int> > &problemComm,
6214 
6215  double imbalance_tolerance_,
6216  size_t num_global_parts_,
6217  mj_part_t *part_no_array_,
6218  int recursion_depth_,
6219 
6220  int coord_dim_,
6221  mj_lno_t num_local_coords_,
6222  mj_gno_t num_global_coords_,
6223  const mj_gno_t *initial_mj_gnos_,
6224  mj_scalar_t **mj_coordinates_,
6225 
6226  int num_weights_per_coord_,
6227  bool *mj_uniform_weights_,
6228  mj_scalar_t **mj_weights_,
6229  bool *mj_uniform_parts_,
6230  mj_scalar_t **mj_part_sizes_,
6231 
6232  mj_part_t *&result_assigned_part_ids_,
6233  mj_gno_t *&result_mj_gnos_
6234 )
6235 {
6236 
6237 
6238 
6239 #ifdef print_debug
6240  if(comm->getRank() == 0){
6241  std::cout << "size of gno:" << sizeof(mj_gno_t) << std::endl;
6242  std::cout << "size of lno:" << sizeof(mj_lno_t) << std::endl;
6243  std::cout << "size of mj_scalar_t:" << sizeof(mj_scalar_t) << std::endl;
6244  }
6245 #endif
6246  this->mj_env = env;
6247  this->mj_problemComm = problemComm;
6248  this->myActualRank = this->myRank = this->mj_problemComm->getRank();
6249 
6250  /*
6251  if (0)
6252  {
6253  int a = rand();
6254  this->mj_problemComm->broadcast(0, sizeof(int), (char *) (&a));
6255  std::string istring = "output_" + Teuchos::toString<int>(a) + "_" + Teuchos::toString<int>(myRank) + ".mtx";
6256 
6257  std::ofstream output(istring.c_str());
6258  output << num_local_coords_ << " " << coord_dim_ << std::endl;
6259  for (int j = 0; j < coord_dim_ ; ++j){
6260  for (size_t i = 0; i < num_local_coords_; ++i){
6261  output << mj_coordinates_[j][i] << std::endl;
6262  }
6263 
6264  }
6265  output.close();
6266  }
6267  */
6268 
6269 
6270  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Total");
6271  this->mj_env->debug(3, "In MultiJagged Jagged");
6272 
6273  {
6274  this->imbalance_tolerance = imbalance_tolerance_;
6275  this->num_global_parts = num_global_parts_;
6276  this->part_no_array = part_no_array_;
6277  this->recursion_depth = recursion_depth_;
6278 
6279  this->coord_dim = coord_dim_;
6280  this->num_local_coords = num_local_coords_;
6281  this->num_global_coords = num_global_coords_;
6282  this->mj_coordinates = mj_coordinates_; //will copy the memory to this->mj_coordinates.
6283  this->initial_mj_gnos = (mj_gno_t *) initial_mj_gnos_; //will copy the memory to this->current_mj_gnos[j].
6284 
6285  this->num_weights_per_coord = num_weights_per_coord_;
6286  this->mj_uniform_weights = mj_uniform_weights_;
6287  this->mj_weights = mj_weights_; //will copy the memory to this->mj_weights
6288  this->mj_uniform_parts = mj_uniform_parts_;
6289  this->mj_part_sizes = mj_part_sizes_;
6290 
6291  this->num_threads = 1;
6292 #ifdef HAVE_ZOLTAN2_OMP
6293 #pragma omp parallel
6294 
6295  {
6296  this->num_threads = omp_get_num_threads();
6297  }
6298 #endif
6299  }
6300  //this->set_input_data();
6301  this->set_part_specifications();
6302 
6303  this->allocate_set_work_memory();
6304 
6305  //We duplicate the comm as we create subcommunicators during migration.
6306  //We keep the problemComm as it is, while comm changes after each migration.
6307  this->comm = this->mj_problemComm->duplicate();
6308 
6309  //initially there is a single partition
6310  mj_part_t current_num_parts = 1;
6311  mj_scalar_t *current_cut_coordinates = this->all_cut_coordinates;
6312 
6313  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Partitioning");
6314 
6315  mj_part_t output_part_begin_index = 0;
6316  mj_part_t future_num_parts = this->total_num_part;
6317  bool is_data_ever_migrated = false;
6318 
6319  std::vector<mj_part_t> *future_num_part_in_parts = new std::vector<mj_part_t> ();
6320  std::vector<mj_part_t> *next_future_num_parts_in_parts = new std::vector<mj_part_t> ();
6321  next_future_num_parts_in_parts->push_back(this->num_global_parts);
6322 
6323  RCP<mj_partBoxVector_t> input_part_boxes(new mj_partBoxVector_t(), true) ;
6324  RCP<mj_partBoxVector_t> output_part_boxes(new mj_partBoxVector_t(), true);
6325 
6326  compute_global_box();
6327  if(this->mj_keep_part_boxes){
6328  this->init_part_boxes(output_part_boxes);
6329  }
6330 
6331  for (int i = 0; i < this->recursion_depth; ++i){
6332  //partitioning array. its size will be the number of current partitions, and it
6333  //holds how many parts each part will be divided into in the current dimension's partitioning.
6334  std::vector <mj_part_t> num_partitioning_in_current_dim;
6335 
6336  //number of parts that will be obtained at the end of this partitioning.
6337  //future_num_part_in_parts is sized as the current number of parts.
6338  //it holds how many more parts each part should be divided into in the further
6339  //iterations. this will be used to calculate num_partitioning_in_current_dim,
6340  //the number of parts that each part will be partitioned into
6341  //in the current dimension's partitioning.
6342 
6343  //next_future_num_parts_in_parts will be sized as the number of output parts,
6344  //and will hold how many more parts each output part
6345  //should be divided into. this array is also used to determine the weight ratios
6346  //of the parts.
6347  //swap the arrays to use them iteratively.
6348  std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
6349  future_num_part_in_parts = next_future_num_parts_in_parts;
6350  next_future_num_parts_in_parts = tmpPartVect;
6351 
6352  //clear next_future_num_parts_in_parts array as
6353  //getPartitionArrays expects it to be empty.
6354  //it also expects num_partitioning_in_current_dim to be empty as well.
6355  next_future_num_parts_in_parts->clear();
6356 
6357  if(this->mj_keep_part_boxes){
6358  RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
6359  input_part_boxes = output_part_boxes;
6360  output_part_boxes = tmpPartBoxes;
6361  output_part_boxes->clear();
6362  }
6363 
6364  //returns the total no. of output parts for this dimension partitioning.
6365  mj_part_t output_part_count_in_dimension =
6366  this->update_part_num_arrays(
6367  num_partitioning_in_current_dim,
6368  future_num_part_in_parts,
6369  next_future_num_parts_in_parts,
6370  future_num_parts,
6371  current_num_parts,
6372  i,
6373  input_part_boxes,
6374  output_part_boxes, 1);
6375 
6376  //if the number of obtained parts is equal to the current number of parts,
6377  //skip this dimension. For example, this happens when 1 is given in the input
6378  //part array, e.g., P=4,5,1,2
6379  if(output_part_count_in_dimension == current_num_parts) {
6380  //still need to swap the input output arrays.
6381  tmpPartVect= future_num_part_in_parts;
6382  future_num_part_in_parts = next_future_num_parts_in_parts;
6383  next_future_num_parts_in_parts = tmpPartVect;
6384 
6385  if(this->mj_keep_part_boxes){
6386  RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
6387  input_part_boxes = output_part_boxes;
6388  output_part_boxes = tmpPartBoxes;
6389  }
6390  continue;
6391  }
6392 
6393 
6394  //get the coordinate axis along which the partitioning will be done.
6395  int coordInd = i % this->coord_dim;
6396  mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
6397 
6398  //convert i to string to be used for debugging purposes.
6399  std::string istring = Teuchos::toString<int>(i);
6400  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Partitioning_" + istring);
6401 
6402  //allocate memory to point to the indices
6403  //of the parts in the permutation array.
6404  this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);
6405 
6406  //the index in new_part_xadj where the next value will be written.
6407  mj_part_t output_part_index = 0;
6408  //whatever is written at output_part_index will be added to output_coordinate_end_index
6409  //so that the points will be shifted.
6410  mj_part_t output_coordinate_end_index = 0;
6411 
6412  mj_part_t current_work_part = 0;
6413  mj_part_t current_concurrent_num_parts =
6414  std::min(current_num_parts - current_work_part, this->max_concurrent_part_calculation);
6415 
6416  mj_part_t obtained_part_index = 0;
6417 
6418  //run for all available parts.
6419  for (; current_work_part < current_num_parts;
6420  current_work_part += current_concurrent_num_parts){
6421 
6422  current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
6423  this->max_concurrent_part_calculation);
6424 
6425  mj_part_t actual_work_part_count = 0;
6426  //initialization for 1D partitioning.
6427  //get the min and max coordinates of each part
6428  //together with the part weights of each part.
6429  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
6430  mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;
6431 
6432  //if this part won't be partitioned any further,
6433  //don't do any work for this part.
6434  if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
6435  continue;
6436  }
6437  ++actual_work_part_count;
6438  mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
6439  mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts == 0 ?
6440  0 : this->part_xadj[current_work_part_in_concurrent_parts - 1];
6441 
6442 /*
6443  std::cout << "i:" << i << " j:" << current_work_part + kk
6444  << " coordinate_begin_index:" << coordinate_begin_index
6445  << " coordinate_end_index:" << coordinate_end_index
6446  << " total:" << coordinate_end_index - coordinate_begin_index<< std::endl;
6447  */
6448  this->mj_get_local_min_max_coord_totW(
6449  coordinate_begin_index,
6450  coordinate_end_index,
6451  this->coordinate_permutations,
6452  mj_current_dim_coords,
6453  this->process_local_min_max_coord_total_weight[kk], //min_coordinate
6454  this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts], //max_coordinate
6455  this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts]); //total_weight
6456 
6457  }
6458 
6459  //1D partitioning
6460  if (actual_work_part_count > 0){
6461  //obtain global Min max of the part.
6462  this->mj_get_global_min_max_coord_totW(
6463  current_concurrent_num_parts,
6464  this->process_local_min_max_coord_total_weight,
6465  this->global_min_max_coord_total_weight);
6466 
6467  //represents the total number of cut lines
6468  //whose coordinates should be determined.
6469  mj_part_t total_incomplete_cut_count = 0;
6470 
6471  //Compute weight ratios for parts & cuts:
6472  //e.g., 0.25 0.25 0.5 0.5 0.75 0.75 1
6473  //part0 cut0 part1 cut1 part2 cut2 part3
6474  mj_part_t concurrent_part_cut_shift = 0;
6475  mj_part_t concurrent_part_part_shift = 0;
6476  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
6477  mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
6478  mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
6479  current_concurrent_num_parts];
6480 
6481  mj_scalar_t global_total_weight = this->global_min_max_coord_total_weight[kk +
6482  2 * current_concurrent_num_parts];
6483 
6484  mj_part_t concurrent_current_part_index = current_work_part + kk;
6485 
6486  mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];
6487 
6488  mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
6489  mj_scalar_t *current_target_part_weights = this->target_part_weights +
6490  concurrent_part_part_shift;
6491  //shift the usedCutCoordinate array by the number of cuts.
6492  concurrent_part_cut_shift += partition_count - 1;
6493  //shift the target part weights array by the number of parts.
6494  concurrent_part_part_shift += partition_count;
6495 
6496 
6497  //calculate only if part is not empty,
6498  //and part will be further partitioned.
6499  if(partition_count > 1 && min_coordinate <= max_coordinate){
6500 
6501  //increase total_incomplete_cut_count by the number of cut lines
6502  //in the current part.
6503  total_incomplete_cut_count += partition_count - 1;
6504  //set the number of cut lines that should be determined
6505  //for this part.
6506  this->my_incomplete_cut_count[kk] = partition_count - 1;
6507 
6508  //get the target weights of the parts.
6509  this->mj_get_initial_cut_coords_target_weights(
6510  min_coordinate,
6511  max_coordinate,
6512  partition_count - 1,
6513  global_total_weight,
6514  usedCutCoordinate,
6515  current_target_part_weights,
6516  future_num_part_in_parts,
6517  next_future_num_parts_in_parts,
6518  concurrent_current_part_index,
6519  obtained_part_index);
6520 
6521  mj_lno_t coordinate_end_index= this->part_xadj[concurrent_current_part_index];
6522  mj_lno_t coordinate_begin_index = concurrent_current_part_index == 0 ?
6523  0 : this->part_xadj[concurrent_current_part_index - 1];
6524 
6525  //get the initial estimated part assignments of the
6526  //coordinates.
6527  this->set_initial_coordinate_parts(
6528  max_coordinate,
6529  min_coordinate,
6530  concurrent_current_part_index,
6531  coordinate_begin_index, coordinate_end_index,
6532  this->coordinate_permutations,
6533  mj_current_dim_coords,
6534  this->assigned_part_ids,
6535  partition_count);
6536  }
6537  else {
6538  // e.g., if we have fewer coordinates than parts, we don't need to do the next dim.
6539  this->my_incomplete_cut_count[kk] = 0;
6540  }
6541  obtained_part_index += partition_count;
6542  }
6543 
6544 
6545 
6546  //the imbalance used here is always 0, as it is difficult to
6547  //estimate a meaningful range.
6548  double used_imbalance = 0;
6549 
6550 
6551  // Determine cut lines for all concurrent parts here.
6552  this->mj_1D_part(
6553  mj_current_dim_coords,
6554  used_imbalance,
6555  current_work_part,
6556  current_concurrent_num_parts,
6557  current_cut_coordinates,
6558  total_incomplete_cut_count,
6559  num_partitioning_in_current_dim);
6560  }
6561 
6562  //create new part chunks
6563  {
6564  mj_part_t output_array_shift = 0;
6565  mj_part_t cut_shift = 0;
6566  size_t tlr_shift = 0;
6567  size_t partweight_array_shift = 0;
6568 
6569  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
6570  mj_part_t current_concurrent_work_part = current_work_part + kk;
6571  mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];
6572 
6573  //if the part is empty, skip the part.
6574  if((num_parts != 1 )
6575  &&
6576  this->global_min_max_coord_total_weight[kk] >
6577  this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {
6578 
6579  //we still need to write the begin and end points of the
6580  //empty part. simply set them to zero; the array indices will be shifted later.
6581  for(mj_part_t jj = 0; jj < num_parts; ++jj){
6582  this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
6583  }
6584  cut_shift += num_parts - 1;
6585  tlr_shift += (4 *(num_parts - 1) + 1);
6586  output_array_shift += num_parts;
6587  partweight_array_shift += (2 * (num_parts - 1) + 1);
6588  continue;
6589  }
6590 
6591  mj_lno_t coordinate_end= this->part_xadj[current_concurrent_work_part];
6592  mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[
6593  current_concurrent_work_part -1];
6594  mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
6595  mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
6596  cut_shift;
6597 
6598  //mj_scalar_t *used_tlr_array = this->total_part_weight_left_right_closests + tlr_shift;
6599 
6600  for(int ii = 0; ii < this->num_threads; ++ii){
6601  this->thread_part_weight_work[ii] = this->thread_part_weights[ii] + partweight_array_shift;
6602  }
6603 
6604  if(num_parts > 1){
6605  if(this->mj_keep_part_boxes){
6606  //if part boxes are to be stored update the boundaries.
6607  for (mj_part_t j = 0; j < num_parts - 1; ++j){
6608  (*output_part_boxes)[output_array_shift + output_part_index +
6609  j].updateMinMax(current_concurrent_cut_coordinate[j], 1
6610  /*update max*/, coordInd);
6611 
6612  (*output_part_boxes)[output_array_shift + output_part_index + j +
6613  1].updateMinMax(current_concurrent_cut_coordinate[j], 0
6614  /*update min*/, coordInd);
6615  }
6616  }
6617 
6618  // Rewrite the indices based on the computed cuts.
6619  this->mj_create_new_partitions(
6620  num_parts,
6621  mj_current_dim_coords,
6622  current_concurrent_cut_coordinate,
6623  coordinate_begin,
6624  coordinate_end,
6625  used_local_cut_line_weight_to_left,
6626  this->thread_part_weight_work,
6627  this->new_part_xadj + output_part_index + output_array_shift
6628  );
6629 
6630  }
6631  else {
6632  //if this part is partitioned into 1 then just copy
6633  //the old values.
6634  mj_lno_t part_size = coordinate_end - coordinate_begin;
6635  *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
6636  memcpy(
6637  this->new_coordinate_permutations + coordinate_begin,
6638  this->coordinate_permutations + coordinate_begin,
6639  part_size * sizeof(mj_lno_t));
6640  }
6641  cut_shift += num_parts - 1;
6642  tlr_shift += (4 *(num_parts - 1) + 1);
6643  output_array_shift += num_parts;
6644  partweight_array_shift += (2 * (num_parts - 1) + 1);
6645  }
6646 
6647  //shift cut coordinates so that all cut coordinates are stored.
6648  //no shift now because we don't keep the cuts.
6649  //current_cut_coordinates += cut_shift;
6650 
6651  //mj_create_new_partitions partitioned the coordinates of the parts and
6652  //wrote the indices as if there were a single part.
6653  //now we need to shift the beginning indices.
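 //e.g., if this concurrent group wrote local offsets {3, 5, 9} and
 //output_coordinate_end_index was 20, the entries become {23, 25, 29}
 //and output_coordinate_end_index advances to 29.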
6654  for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
6655  mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
6656  for (mj_part_t ii = 0;ii < num_parts ; ++ii){
6657  //shift it by previousCount
6658  this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
6659  }
6660  //update the previous count to the current end.
6661  output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
6662  //advance the output part index.
6663  output_part_index += num_parts ;
6664  }
6665  }
6666  }
6667  // end of this partitioning dimension
6668 
6669 
6670  int current_world_size = this->comm->getSize();
6671  long migration_reduce_all_population = this->total_dim_num_reduce_all * current_world_size;
6672 
6673 
6674  bool is_migrated_in_current_dimension = false;
6675 
6676  //we migrate if there are more partitionings to be done after this step,
6677  //if migration is not forced to be avoided,
6678  //and if the operation is not sequential.
6679  if (future_num_parts > 1 &&
6680  this->check_migrate_avoid_migration_option >= 0 &&
6681  current_world_size > 1){
6682 
6683  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Migration-" + istring);
6684  mj_part_t num_parts = output_part_count_in_dimension;
6685  if ( this->mj_perform_migration(
6686  num_parts,
6687  current_num_parts, //output
6688  next_future_num_parts_in_parts, //output
6689  output_part_begin_index,
6690  migration_reduce_all_population,
6691  this->num_global_coords / (future_num_parts * current_num_parts),
6692  istring,
6693  input_part_boxes, output_part_boxes) ) {
6694  is_migrated_in_current_dimension = true;
6695  is_data_ever_migrated = true;
6696  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Migration-" +
6697  istring);
6698  //since data is migrated, we reduce the number of reduceAll operations for the last part.
6699  this->total_dim_num_reduce_all /= num_parts;
6700  }
6701  else {
6702  is_migrated_in_current_dimension = false;
6703  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Migration-" + istring);
6704  }
6705  }
6706 
6707  //swap the coordinate permutations for the next dimension.
6708  mj_lno_t * tmp = this->coordinate_permutations;
6709  this->coordinate_permutations = this->new_coordinate_permutations;
6710  this->new_coordinate_permutations = tmp;
6711 
6712  if(!is_migrated_in_current_dimension){
6713  this->total_dim_num_reduce_all -= current_num_parts;
6714  current_num_parts = output_part_count_in_dimension;
6715  }
6716  freeArray<mj_lno_t>(this->part_xadj);
6717  this->part_xadj = this->new_part_xadj;
6718  this->new_part_xadj = NULL;
6719  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Partitioning_" + istring);
6720  }
6721 
6722  // Partitioning is done
6723  delete future_num_part_in_parts;
6724  delete next_future_num_parts_in_parts;
6725 
6726  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Partitioning");
6728 
6729 
6730  //get the final parts of each initial coordinate
6731  //the results will be written to
6732  //this->assigned_part_ids for gnos given in this->current_mj_gnos
6733  this->set_final_parts(
6734  current_num_parts,
6735  output_part_begin_index,
6736  output_part_boxes,
6737  is_data_ever_migrated);
6738 
6739  result_assigned_part_ids_ = this->assigned_part_ids;
6740  result_mj_gnos_ = this->current_mj_gnos;
6741 
6742  this->free_work_memory();
6743  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Total");
6744  this->mj_env->debug(3, "Out of MultiJagged");
6745 
6746 }
6747 
6748 
6752 template <typename Adapter>
6753 class Zoltan2_AlgMJ : public Algorithm<Adapter>
6754 {
6755 private:
6756 
6757 #ifndef DOXYGEN_SHOULD_SKIP_THIS
6758 
6759  typedef CoordinateModel<typename Adapter::base_adapter_t> coordinateModel_t;
6760 
6761  // For coordinates and weights, MJ needs floats or doubles
6762  // But Adapter can provide other scalars, e.g., ints.
6763  // So have separate scalar_t for MJ and adapter.
6764  typedef typename Adapter::scalar_t adapter_scalar_t;
6765 
6766  // Provide a default type for mj_scalar_t;
6767  typedef float default_mj_scalar_t;
6768 
6769  // If Adapter provided float or double scalar_t, use it (prevents copies).
6770  // Otherwise, use the default type of mj_scalar_t;
6771  typedef typename
6772  std::conditional<
6773  (std::is_same<adapter_scalar_t, float>::value ||
6774  std::is_same<adapter_scalar_t, double>::value),
6775  adapter_scalar_t, default_mj_scalar_t>::type mj_scalar_t;
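 // e.g., an Adapter with scalar_t == double keeps mj_scalar_t == double (no copy),
 // while an Adapter with scalar_t == int falls back to mj_scalar_t == float above.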
6776 
6777  typedef typename Adapter::gno_t mj_gno_t;
6778  typedef typename Adapter::lno_t mj_lno_t;
6779  typedef typename Adapter::node_t mj_node_t;
6780  typedef typename Adapter::part_t mj_part_t;
6781  typedef coordinateModelPartBox mj_partBox_t;
6782  typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
6783 #endif
6785 
6786  RCP<const Environment> mj_env; //the environment object
6787  RCP<const Comm<int> > mj_problemComm; //initial comm object
6788  RCP<const coordinateModel_t> mj_coords; //coordinate adapter
6789 
6790  //PARAMETERS
6791  double imbalance_tolerance; //input imbalance tolerance.
6792  size_t num_global_parts; //the targeted number of parts
6793  mj_part_t *part_no_array; //input part array specifying the number of parts to divide into along each dim.
6794  int recursion_depth; //the number of steps that partitioning will be solved in.
6795 
6796  int coord_dim; // coordinate dimension.
6797  mj_lno_t num_local_coords; //number of local coords.
6798  mj_gno_t num_global_coords; //number of global coords.
6799  const mj_gno_t *initial_mj_gnos; //initial global ids of the coordinates.
6800  mj_scalar_t **mj_coordinates; //two dimension coordinate array
6801 
6802  int num_weights_per_coord; // number of weights per coordinate
6803  bool *mj_uniform_weights; //if the coordinates have uniform weights.
6804  mj_scalar_t **mj_weights; //two dimensional weight array
6805  bool *mj_uniform_parts; //if the target parts are uniform
6806  mj_scalar_t **mj_part_sizes; //target part weight sizes.
6807 
6808  // Nonuniform first level partitioning
6809  // Currently used for Dragonfly task mapping by partitioning Dragonfly RCA
6810  // machine coordinates and application coordinates.
6811  // An optimization that completely partitions the most important machine dimension
6812  // first (i.e. the Dragonfly group coordinate, or RCA's x coordinate). The standard
6813  // MJ alg follows after the nonuniform first level partitioning.
6814  mj_part_t num_first_level_parts; // If used, number of parts for the first level partitioning
6815  const mj_part_t *first_level_distribution; // If used, the distribution of parts for the nonuniform first level partitioning
6816 
6817  bool distribute_points_on_cut_lines; //if partitioning can distribute points on the same coordinate to different parts.
6818  mj_part_t max_concurrent_part_calculation; // how many parts we can calculate concurrently.
6819  int check_migrate_avoid_migration_option; //whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
6820  int migration_type; // when doing the migration, 0 will aim for perfect load balance,
6821  //1 for minimized messages
6822  double minimum_migration_imbalance; //when MJ decides whether to migrate, the minimum imbalance for migration.
6823  bool mj_keep_part_boxes; //if the boxes need to be kept.
6824 
6825  int num_threads;
6826 
6827  bool mj_run_as_rcb; //if this is set, then recursion depth is adjusted to its maximum value.
6828  int mj_premigration_option;
6829  int min_coord_per_rank_for_premigration;
6830 
6831  ArrayRCP<mj_part_t> comXAdj_; //communication graph xadj
6832  ArrayRCP<mj_part_t> comAdj_; //communication graph adj.
6833 
6834 
6835  //when we have strided data, the adapter returns unstrided data in RCP form.
6836  //we need to hold on to that data during the execution of MJ, so that the data is not released.
6837  //coordinate_ArrayRCP_holder will hold that data, and release it when MJ is deleted.
6838  ArrayRCP<const mj_scalar_t> * coordinate_ArrayRCP_holder;
6839 
6840  void set_up_partitioning_data(
6841  const RCP<PartitioningSolution<Adapter> >&solution);
6842 
6843  void set_input_parameters(const Teuchos::ParameterList &p);
6844 
6845  void free_work_memory();
6846 
6847  RCP<mj_partBoxVector_t> getGlobalBoxBoundaries() const;
6848 
6849  bool mj_premigrate_to_subset(int used_num_ranks, int migration_selection_option,
6850  RCP<const Environment> mj_env_,
6851  RCP<const Comm<int> > mj_problemComm_,
6852  int coord_dim_,
6853  mj_lno_t num_local_coords_,
6854  mj_gno_t num_global_coords_, size_t num_global_parts_,
6855  const mj_gno_t *initial_mj_gnos_,
6856  mj_scalar_t **mj_coordinates_,
6857  int num_weights_per_coord_,
6858  mj_scalar_t **mj_weights_,
6859  //results
6860  RCP<const Comm<int> > &result_problemComm_,
6861  mj_lno_t & result_num_local_coords_,
6862  mj_gno_t * &result_initial_mj_gnos_,
6863  mj_scalar_t ** &result_mj_coordinates_,
6864  mj_scalar_t ** &result_mj_weights_,
6865  int * &result_actual_owner_rank_);
6866 
6867 public:
6868 
6869  Zoltan2_AlgMJ(const RCP<const Environment> &env,
6870  RCP<const Comm<int> > &problemComm,
6871  const RCP<const coordinateModel_t> &coords) :
6872  mj_partitioner(), mj_env(env),
6873  mj_problemComm(problemComm),
6874  mj_coords(coords),
6875  imbalance_tolerance(0),
6876  num_global_parts(1),
6877  part_no_array(NULL),
6878  recursion_depth(0),
6879  coord_dim(0),
6880  num_local_coords(0),
6881  num_global_coords(0),
6882  initial_mj_gnos(NULL),
6883  mj_coordinates(NULL),
6884  num_weights_per_coord(0),
6885  mj_uniform_weights(NULL),
6886  mj_weights(NULL),
6887  mj_uniform_parts(NULL),
6888  mj_part_sizes(NULL),
6889  num_first_level_parts(1),
6890  first_level_distribution(NULL),
6891  distribute_points_on_cut_lines(true),
6892  max_concurrent_part_calculation(1),
6893  check_migrate_avoid_migration_option(0),
6894  migration_type(0),
6895  minimum_migration_imbalance(0.30),
6896  mj_keep_part_boxes(false),
6897  num_threads(1),
6898  mj_run_as_rcb(false),
6899  mj_premigration_option(0),
6900  min_coord_per_rank_for_premigration(32000),
6901  comXAdj_(), comAdj_(),
6902  coordinate_ArrayRCP_holder(NULL)
6903  {}
6904 
6905  ~Zoltan2_AlgMJ(){
6906  if (coordinate_ArrayRCP_holder != NULL){
6907  delete [] this->coordinate_ArrayRCP_holder;
6908  this->coordinate_ArrayRCP_holder = NULL;
6909  }
6910  }
6911 
6914  static void getValidParameters(ParameterList & pl)
6915  {
6916  const bool bUnsorted = true; // this clarifies that the flag is for unsorted input
6917  RCP<Zoltan2::IntegerRangeListValidator<int>> mj_parts_Validator =
6918  Teuchos::rcp( new Zoltan2::IntegerRangeListValidator<int>(bUnsorted) );
6919  pl.set("mj_parts", "0", "list of parts for multiJagged partitioning "
6920  "algorithm. As many as the dimension count.", mj_parts_Validator);
6921 
6922  pl.set("mj_concurrent_part_count", 1, "The number of parts whose cut "
6923  "coordinates will be calculated concurrently.", Environment::getAnyIntValidator());
6924 
6925  pl.set("mj_minimum_migration_imbalance", 1.1,
6926  "mj_minimum_migration_imbalance, the minimum imbalance of the "
6927  "processors to avoid migration",
6928  Environment::getAnyDoubleValidator());
6929 
6930  RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_option_validator =
6931  Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 2) );
6932  pl.set("mj_migration_option", 1, "Migration option, 0 for decision "
6933  "depending on the imbalance, 1 for forcing migration, 2 for "
6934  "avoiding migration", mj_migration_option_validator);
6935 
6936 
6937 
6938 
6939  RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_type_validator =
6940  Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 1) );
6941  pl.set("mj_migration_type", 0, "Migration type, 0 for migration to minimize the imbalance, "
6942  "1 for migration to minimize the messages exchanged during migration.",
6943  mj_migration_type_validator);
6944 
6945  // bool parameter
6946  pl.set("mj_keep_part_boxes", false, "Keep the part boundaries of the "
6947  "geometric partitioning.", Environment::getBoolValidator());
6948 
6949  // bool parameter
6950  pl.set("mj_enable_rcb", false, "Use MJ as RCB.",
6951  Environment::getBoolValidator());
6952 
6953  pl.set("mj_recursion_depth", -1, "Recursion depth for MJ: Must be "
6954  "greater than 0.", Environment::getAnyIntValidator());
6955 
6956  RCP<Teuchos::EnhancedNumberValidator<int>> mj_premigration_option_validator =
6957  Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 1024) );
6958 
6959  pl.set("mj_premigration_option", 0, "Whether to do premigration or not. 0 for no premigration; "
6960  "x > 0 for migration to consecutive processors, where the subset will be ranks 0,x,2x,3x,..."
6961  , mj_premigration_option_validator);
6962 
6963  pl.set("mj_premigration_coordinate_count", 32000, "How many coordinates to assign to each rank in multijagged after premigration"
6964  , Environment::getAnyIntValidator());
6965 
6966  }
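 // Illustrative usage sketch (assumes the standard Zoltan2 "algorithm"
 // parameter is used elsewhere to select MJ; parameter names are the ones
 // registered above):
 //   Teuchos::ParameterList params("test");
 //   params.set("algorithm", "multijagged");
 //   params.set("mj_parts", "2,3,4");            // 2*3*4 = 24 parts in 3 steps
 //   params.set("mj_concurrent_part_count", 2);
 //   params.set("mj_keep_part_boxes", true);     // needed for pointAssign/boxAssign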
6967 
6974  void partition(const RCP<PartitioningSolution<Adapter> > &solution);
6975 
6976  mj_partBoxVector_t &getPartBoxesView() const
6977  {
6978  RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
6979  return *pBoxes;
6980  }
6981 
6982  mj_part_t pointAssign(int dim, adapter_scalar_t *point) const;
6983 
6984  void boxAssign(int dim, adapter_scalar_t *lower, adapter_scalar_t *upper,
6985  size_t &nPartsFound, mj_part_t **partsFound) const;
6986 
6987 
6990  void getCommunicationGraph(
6991  const PartitioningSolution<Adapter> *solution,
6992  ArrayRCP<mj_part_t> &comXAdj,
6993  ArrayRCP<mj_part_t> &comAdj);
6994 };
6995 
6996 
6997 
6998 
6999 template <typename Adapter>
7000 bool Zoltan2_AlgMJ<Adapter>::mj_premigrate_to_subset( int used_num_ranks,
7001  int /* migration_selection_option */,
7002  RCP<const Environment> mj_env_,
7003  RCP<const Comm<int> > mj_problemComm_,
7004  int coord_dim_,
7005  mj_lno_t num_local_coords_,
7006  mj_gno_t /* num_global_coords_ */, size_t /* num_global_parts_ */,
7007  const mj_gno_t *initial_mj_gnos_,
7008  mj_scalar_t **mj_coordinates_,
7009  int num_weights_per_coord_,
7010  mj_scalar_t **mj_weights_,
7011  //results
7012  RCP<const Comm<int> > &result_problemComm_,
7013  mj_lno_t &result_num_local_coords_,
7014  mj_gno_t * &result_initial_mj_gnos_,
7015  mj_scalar_t ** &result_mj_coordinates_,
7016  mj_scalar_t ** &result_mj_weights_,
7017  int * &result_actual_owner_rank_){
7018  mj_env_->timerStart(MACRO_TIMERS, "MultiJagged - PreMigration DistributorPlanCreating");
7019 
7020 
7021  int myRank = mj_problemComm_->getRank();
7022  int worldSize = mj_problemComm_->getSize();
7023 
7024  mj_part_t groupsize = worldSize / used_num_ranks;
7025 
7026  //std::cout << "used_num_ranks:" << used_num_ranks << " groupsize:" << groupsize << std::endl;
7027 
7028  std::vector<mj_part_t> group_begins(used_num_ranks + 1, 0);
7029 
7030  mj_part_t i_am_sending_to = 0;
7031  bool am_i_a_receiver = false;
7032 
7033  for(int i = 0; i < used_num_ranks; ++i){
7034  group_begins[i+ 1] = group_begins[i] + groupsize;
7035  if (worldSize % used_num_ranks > i) group_begins[i+ 1] += 1;
7036  if (i == used_num_ranks) group_begins[i+ 1] = worldSize;
7037  if (myRank >= group_begins[i] && myRank < group_begins[i + 1]) i_am_sending_to = group_begins[i];
7038  if (myRank == group_begins[i]) am_i_a_receiver= true;
7039  }
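 //e.g., worldSize = 10, used_num_ranks = 4: groupsize = 2 and the remainder 2
 //is spread over the first groups, so group_begins = {0, 3, 6, 8, 10};
 //ranks 0-2 send to 0, 3-5 to 3, 6-7 to 6, 8-9 to 8, and ranks 0, 3, 6, 8
 //are the receivers that form the subcommunicator.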
7040 
7041  ArrayView<const mj_part_t> idView(&(group_begins[0]), used_num_ranks );
7042  result_problemComm_ = mj_problemComm_->createSubcommunicator(idView);
7043 
7044 
7045  Tpetra::Distributor distributor(mj_problemComm_);
7046 
7047  std::vector<mj_part_t> coordinate_destinations(num_local_coords_, i_am_sending_to);
7048  ArrayView<const mj_part_t> destinations( &(coordinate_destinations[0]), num_local_coords_);
7049  mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
7050  result_num_local_coords_ = num_incoming_gnos;
7051  mj_env_->timerStop(MACRO_TIMERS, "MultiJagged - PreMigration DistributorPlanCreating");
7052 
7053  mj_env_->timerStart(MACRO_TIMERS, "MultiJagged - PreMigration DistributorMigration");
7054 
7055  //migrate gnos.
7056  {
7057  ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
7058 
7059  ArrayView<const mj_gno_t> sent_gnos(initial_mj_gnos_, num_local_coords_);
7060  distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
7061 
7062  result_initial_mj_gnos_ = allocMemory<mj_gno_t>(num_incoming_gnos);
7063  memcpy(
7064  result_initial_mj_gnos_,
7065  received_gnos.getRawPtr(),
7066  num_incoming_gnos * sizeof(mj_gno_t));
7067  }
7068 
7069  //migrate coordinates
7070  result_mj_coordinates_ = allocMemory<mj_scalar_t *>(coord_dim_);
7071  for (int i = 0; i < coord_dim_; ++i){
7072  ArrayView<const mj_scalar_t> sent_coord(mj_coordinates_[i], num_local_coords_);
7073  ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
7074  distributor.doPostsAndWaits<mj_scalar_t>(sent_coord, 1, received_coord());
7075  result_mj_coordinates_[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
7076  memcpy(
7077  result_mj_coordinates_[i],
7078  received_coord.getRawPtr(),
7079  num_incoming_gnos * sizeof(mj_scalar_t));
7080  }
7081 
7082  result_mj_weights_ = allocMemory<mj_scalar_t *>(num_weights_per_coord_);
7083  //migrate weights.
7084  for (int i = 0; i < num_weights_per_coord_; ++i){
7085  ArrayView<const mj_scalar_t> sent_weight(mj_weights_[i], num_local_coords_);
7086  ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
7087  distributor.doPostsAndWaits<mj_scalar_t>(sent_weight, 1, received_weight());
7088  result_mj_weights_[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
7089  memcpy(
7090  result_mj_weights_[i],
7091  received_weight.getRawPtr(),
7092  num_incoming_gnos * sizeof(mj_scalar_t));
7093  }
7094 
7095  //migrate the owners of the coordinates
7096  {
7097  std::vector<int> owner_of_coordinate(num_local_coords_, myRank);
7098  ArrayView<int> sent_owners(&(owner_of_coordinate[0]), num_local_coords_);
7099  ArrayRCP<int> received_owners(num_incoming_gnos);
7100  distributor.doPostsAndWaits<int>(sent_owners, 1, received_owners());
7101  result_actual_owner_rank_ = allocMemory<int>(num_incoming_gnos);
7102  memcpy(
7103  result_actual_owner_rank_,
7104  received_owners.getRawPtr(),
7105  num_incoming_gnos * sizeof(int));
7106  }
7107  mj_env_->timerStop(MACRO_TIMERS, "MultiJagged - PreMigration DistributorMigration");
7108  return am_i_a_receiver;
7109 }
7110 
7111 
7112 
7113 
7114 
7115 
7116 
7126 template <typename Adapter>
7127 void Zoltan2_AlgMJ<Adapter>::partition(
7128  const RCP<PartitioningSolution<Adapter> > &solution
7129 )
7130 {
7131  this->set_up_partitioning_data(solution);
7132  this->set_input_parameters(this->mj_env->getParameters());
7133  if (this->mj_keep_part_boxes){
7134  this->mj_partitioner.set_to_keep_part_boxes();
7135  }
7136  this->mj_partitioner.set_partitioning_parameters(
7137  this->distribute_points_on_cut_lines,
7138  this->max_concurrent_part_calculation,
7139  this->check_migrate_avoid_migration_option,
7140  this->minimum_migration_imbalance, this->migration_type);
7141 
7142 
7143  RCP<const Comm<int> > result_problemComm = this->mj_problemComm;
7144  mj_lno_t result_num_local_coords = this->num_local_coords;
7145  mj_gno_t * result_initial_mj_gnos = NULL;
7146  mj_scalar_t **result_mj_coordinates = this->mj_coordinates;
7147  mj_scalar_t **result_mj_weights = this->mj_weights;
7148  int *result_actual_owner_rank = NULL;
7149  const mj_gno_t * result_initial_mj_gnos_ = this->initial_mj_gnos;
7150 
7151  //TODO: MD 08/2017: Further discussion is required.
7152  //MueLu calls MJ when it has very few coordinates per processor, such as 10.
7153  //For example, it begins with 1K processors with 1K coordinates each.
7154  //Then, with coarsening, this reduces to 10 coordinates per processor.
7155  //It calls MJ to repartition these into 10 parts.
7156  //MJ runs with 1K processors, 10 coordinates each, and partitions to 10 parts.
7157  //As expected, strong scaling is a problem here, because computation is almost 0 and
7158  //the communication cost of MJ increases linearly.
7159  //The premigration option gathers the coordinates onto a subset of the ranks before MJ starts,
7160  //so MJ runs on a smaller subset of the problem.
7161  //Below, the coordinates are migrated if mj_premigration_option is set,
7162  //the target part count is less than the current number of ranks, and the average number of
7163  //local coordinates is less than some threshold.
7164  //For example, premigration may not help if 1000 processors are partitioning data to 10 parts
7165  //but each of them already has 1M coordinates; in that case premigration would not help.
7166  int current_world_size = this->mj_problemComm->getSize();
7167  mj_lno_t threshold_num_local_coords = this->min_coord_per_rank_for_premigration;
7168  bool is_pre_migrated = false;
7169  bool am_i_in_subset = true;
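 //e.g., with 1000 ranks targeting 10 parts and a 32000-coordinate threshold,
 //premigration is attempted only when the global coordinate count is below
 //1000 * 32000 = 32M; used_num_ranks is then roughly num_global_coords / 32000
 //(rounded, and at least 1).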
7170  if ( mj_premigration_option > 0 &&
7171  size_t (current_world_size) > this->num_global_parts &&
7172  this->num_global_coords < mj_gno_t (current_world_size * threshold_num_local_coords)){
7173  if (this->mj_keep_part_boxes){
7174  throw std::logic_error("Multijagged: mj_keep_part_boxes and mj_premigration_option are not supported together yet.");
7175  }
7176  is_pre_migrated =true;
7177  int migration_selection_option = mj_premigration_option;
7178  if(migration_selection_option * this->num_global_parts > (size_t) (current_world_size)){
7179  migration_selection_option = current_world_size / this->num_global_parts;
7180  }
7181  int used_num_ranks = int (this->num_global_coords / float (threshold_num_local_coords) + 0.5);
7182  if (used_num_ranks == 0) used_num_ranks = 1;
7183 
7184  am_i_in_subset = this->mj_premigrate_to_subset(
7185  used_num_ranks,
7186  migration_selection_option,
7187  this->mj_env,
7188  this->mj_problemComm,
7189  this->coord_dim,
7190  this->num_local_coords,
7191  this->num_global_coords,
7192  this->num_global_parts,
7193  this->initial_mj_gnos,
7194  this->mj_coordinates,
7195  this->num_weights_per_coord,
7196  this->mj_weights,
7197  //results
7198  result_problemComm,
7199  result_num_local_coords,
7200  result_initial_mj_gnos,
7201  result_mj_coordinates,
7202  result_mj_weights,
7203  result_actual_owner_rank);
7204  result_initial_mj_gnos_ = result_initial_mj_gnos;
7205  }
7206 
7207 
7208 
7209  mj_part_t *result_assigned_part_ids = NULL;
7210  mj_gno_t *result_mj_gnos = NULL;
7211 
7212  if (am_i_in_subset){
7213  this->mj_partitioner.multi_jagged_part(
7214  this->mj_env,
7215  result_problemComm, //this->mj_problemComm,
7216 
7217  this->imbalance_tolerance,
7218  this->num_global_parts,
7219  this->part_no_array,
7220  this->recursion_depth,
7221 
7222  this->coord_dim,
7223  result_num_local_coords, //this->num_local_coords,
7224  this->num_global_coords,
7225  result_initial_mj_gnos_, //this->initial_mj_gnos,
7226  result_mj_coordinates, //this->mj_coordinates,
7227 
7228  this->num_weights_per_coord,
7229  this->mj_uniform_weights,
7230  result_mj_weights, //this->mj_weights,
7231  this->mj_uniform_parts,
7232  this->mj_part_sizes,
7233 
7234  result_assigned_part_ids,
7235  result_mj_gnos
7236  );
7237 
7238  }
7239 
7240  // Reorder results so that they match the order of the input
7241 
7242 #if defined(__cplusplus) && __cplusplus >= 201103L
7243  std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid;
7244  localGidToLid.reserve(result_num_local_coords);
7245  for (mj_lno_t i = 0; i < result_num_local_coords; i++)
7246  localGidToLid[result_initial_mj_gnos_[i]] = i;
7247  ArrayRCP<mj_part_t> partId = arcp(new mj_part_t[result_num_local_coords],
7248  0, result_num_local_coords, true);
7249 
7250  for (mj_lno_t i = 0; i < result_num_local_coords; i++) {
7251  mj_lno_t origLID = localGidToLid[result_mj_gnos[i]];
7252  partId[origLID] = result_assigned_part_ids[i];
7253  }
7254 
7255 #else
7256  Teuchos::Hashtable<mj_gno_t, mj_lno_t>
7257  localGidToLid(result_num_local_coords);
7258  for (mj_lno_t i = 0; i < result_num_local_coords; i++)
7259  localGidToLid.put(result_initial_mj_gnos_[i], i);
7260 
7261  ArrayRCP<mj_part_t> partId = arcp(new mj_part_t[result_num_local_coords],
7262  0, result_num_local_coords, true);
7263 
7264  for (mj_lno_t i = 0; i < result_num_local_coords; i++) {
7265  mj_lno_t origLID = localGidToLid.get(result_mj_gnos[i]);
7266  partId[origLID] = result_assigned_part_ids[i];
7267  }
7268 
7269 #endif // C++11 is enabled
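 // e.g., if initial_mj_gnos = {10, 11, 12} and MJ returned result_mj_gnos =
 // {12, 10, 11} with parts {2, 0, 1}, partId becomes {0, 1, 2}, so that
 // partId[i] is the part of the i-th input coordinate.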
7270 
7271  delete [] result_mj_gnos;
7272  delete [] result_assigned_part_ids;
7273 
7274 
7275  //now the results are reordered. but if premigration occurred,
7276  //then we need to send these ids back to the actual owners.
7277  if (is_pre_migrated){
7278  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - PostMigration DistributorPlanCreating");
7279  Tpetra::Distributor distributor(this->mj_problemComm);
7280 
7281  ArrayView<const mj_part_t> actual_owner_destinations( result_actual_owner_rank , result_num_local_coords);
7282  mj_lno_t num_incoming_gnos = distributor.createFromSends(actual_owner_destinations);
7283  if (num_incoming_gnos != this->num_local_coords){
7284  throw std::logic_error("Zoltan2 - Multijagged Post Migration - num incoming is not equal to num local coords");
7285  }
7286  mj_env->timerStop(MACRO_TIMERS, "MultiJagged - PostMigration DistributorPlanCreating");
7287  mj_env->timerStart(MACRO_TIMERS, "MultiJagged - PostMigration DistributorMigration");
7288  ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
7289  ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
7290  {
7291  ArrayView<const mj_gno_t> sent_gnos(result_initial_mj_gnos_, result_num_local_coords);
7292  distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
7293  }
7294  {
7295  ArrayView<mj_part_t> sent_partnos(partId());
7296  distributor.doPostsAndWaits<mj_part_t>(sent_partnos, 1, received_partids());
7297  }
7298  partId = arcp(new mj_part_t[this->num_local_coords],
7299  0, this->num_local_coords, true);
7300 
7301  {
7302 #if defined(__cplusplus) && __cplusplus >= 201103L
7303  std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid2;
7304  localGidToLid2.reserve(this->num_local_coords);
7305  for (mj_lno_t i = 0; i < this->num_local_coords; i++)
7306  localGidToLid2[this->initial_mj_gnos[i]] = i;
7307 
7308 
7309  for (mj_lno_t i = 0; i < this->num_local_coords; i++) {
7310  mj_lno_t origLID = localGidToLid2[received_gnos[i]];
7311  partId[origLID] = received_partids[i];
7312  }
7313 
7314 #else
7315  Teuchos::Hashtable<mj_gno_t, mj_lno_t>
7316  localGidToLid2(this->num_local_coords);
7317  for (mj_lno_t i = 0; i < this->num_local_coords; i++)
7318  localGidToLid2.put(this->initial_mj_gnos[i], i);
7319 
7320 
7321  for (mj_lno_t i = 0; i < this->num_local_coords; i++) {
7322  mj_lno_t origLID = localGidToLid2.get(received_gnos[i]);
7323  partId[origLID] = received_partids[i];
7324  }
7325 
7326 #endif // C++11 is enabled
7327 
7328  }
7329 
7330  {
7331  freeArray<mj_gno_t> (result_initial_mj_gnos);
7332  for (int i = 0; i < this->coord_dim; ++i){
7333  freeArray<mj_scalar_t> (result_mj_coordinates[i]);
7334  }
7335  freeArray<mj_scalar_t *> (result_mj_coordinates);
7336 
7337  for (int i = 0; i < this->num_weights_per_coord; ++i){
7338  freeArray<mj_scalar_t> (result_mj_weights[i]);
7339  }
7340  freeArray<mj_scalar_t *> (result_mj_weights);
7341  freeArray<int> (result_actual_owner_rank);
7342  }
7343  mj_env->timerStop(MACRO_TIMERS, "MultiJagged - PostMigration DistributorMigration");
7344 
7345  }
7346 
7347  solution->setParts(partId);
7348  this->free_work_memory();
7349 }
7350 
7351 /* \brief Freeing the memory allocated.
7352  * */
7353 template <typename Adapter>
7354 void Zoltan2_AlgMJ<Adapter>::free_work_memory(){
7355  freeArray<mj_scalar_t *>(this->mj_coordinates);
7356  freeArray<mj_scalar_t *>(this->mj_weights);
7357  freeArray<bool>(this->mj_uniform_parts);
7358  freeArray<mj_scalar_t *>(this->mj_part_sizes);
7359  freeArray<bool>(this->mj_uniform_weights);
7360 
7361 }
7362 
7363 /* \brief Sets the partitioning data for multijagged algorithm.
7364  * */
7365 template <typename Adapter>
7366 void Zoltan2_AlgMJ<Adapter>::set_up_partitioning_data(
7367  const RCP<PartitioningSolution<Adapter> > &solution
7368 )
7369 {
7370  this->coord_dim = this->mj_coords->getCoordinateDim();
7371  this->num_weights_per_coord = this->mj_coords->getNumWeightsPerCoordinate();
7372  this->num_local_coords = this->mj_coords->getLocalNumCoordinates();
7373  this->num_global_coords = this->mj_coords->getGlobalNumCoordinates();
7374  int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
7375 
7376  // From the Solution we get part information.
7377  // If the part sizes for a given criteria are not uniform,
7378  // then they are values that sum to 1.0.
7379  this->num_global_parts = solution->getTargetGlobalNumberOfParts();
7380  //allocate only the two dimensional pointer arrays.
7381  //raw pointer addresses will be obtained from the multivector.
7382  this->mj_coordinates = allocMemory<mj_scalar_t *>(this->coord_dim);
7383  this->mj_weights = allocMemory<mj_scalar_t *>(criteria_dim);
7384 
7385  //if the partitioning results are to be uniform.
7386  this->mj_uniform_parts = allocMemory< bool >(criteria_dim);
7387  //if uniform parts is false in a criteria dimension, this holds the ratios of
7388  //the target part weights.
7389  this->mj_part_sizes = allocMemory<mj_scalar_t *>(criteria_dim);
7390  //if the weights of coordinates are uniform in a criteria dimension.
7391  this->mj_uniform_weights = allocMemory< bool >(criteria_dim);
7392 
7393  typedef StridedData<mj_lno_t, adapter_scalar_t> input_t;
7394  ArrayView<const mj_gno_t> gnos;
7395  ArrayView<input_t> xyz;
7396  ArrayView<input_t> wgts;
7397 
7398 
7399  this->coordinate_ArrayRCP_holder = new ArrayRCP<const mj_scalar_t> [this->coord_dim + this->num_weights_per_coord];
7400 
7401  this->mj_coords->getCoordinates(gnos, xyz, wgts);
7402  //obtain global ids.
7403  ArrayView<const mj_gno_t> mj_gnos = gnos;
7404  this->initial_mj_gnos = mj_gnos.getRawPtr();
7405 
7406  //extract coordinates from multivector.
7407  for (int dim=0; dim < this->coord_dim; dim++){
7408  ArrayRCP<const mj_scalar_t> ar;
7409  xyz[dim].getInputArray(ar); // will copy if stride != 1 or
7410  // adapter_scalar_t != mj_scalar_t
7411  this->coordinate_ArrayRCP_holder[dim] = ar;
7412 
7413  //multiJagged coordinate values assignment
7414  this->mj_coordinates[dim] = (mj_scalar_t *)ar.getRawPtr();
7415  }
7416 
7417  //if no weights are provided, set uniform weights.
7418  if (this->num_weights_per_coord == 0){
7419  this->mj_uniform_weights[0] = true;
7420  this->mj_weights[0] = NULL;
7421  }
7422  else{
7423  //if weights are provided get weights for all weight indices
7424  for (int wdim = 0; wdim < this->num_weights_per_coord; wdim++){
7425  ArrayRCP<const mj_scalar_t> ar;
7426  wgts[wdim].getInputArray(ar); // will copy if stride!=1
7427  // or adapter_scalar_t !=
7428  // mj_scalar_t
7429  this->coordinate_ArrayRCP_holder[this->coord_dim + wdim] = ar;
7430  this->mj_uniform_weights[wdim] = false;
7431  this->mj_weights[wdim] = (mj_scalar_t *) ar.getRawPtr();
7432  }
7433  }
7434 
7435  for (int wdim = 0; wdim < criteria_dim; wdim++){
7436  if (solution->criteriaHasUniformPartSizes(wdim)){
7437  this->mj_uniform_parts[wdim] = true;
7438  this->mj_part_sizes[wdim] = NULL;
7439  }
7440  else{
7441  std::cerr << "MJ does not support non uniform target part weights" << std::endl;
7442  exit(1);
7443  }
7444  }
7445 }
7446 
7447 /* \brief Sets the partitioning parameters for multijagged algorithm.
7448  * \param pl: is the parameter list provided to zoltan2 call
7449  * */
7450 template <typename Adapter>
7451 void Zoltan2_AlgMJ<Adapter>::set_input_parameters(const Teuchos::ParameterList &pl){
7452 
7453  const Teuchos::ParameterEntry *pe = pl.getEntryPtr("imbalance_tolerance");
7454  if (pe){
7455  double tol;
7456  tol = pe->getValue(&tol);
7457  this->imbalance_tolerance = tol - 1.0;
7458  }
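 // e.g., a user-facing imbalance_tolerance of 1.10 (10% allowed imbalance)
 // is stored internally as 0.10.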
7459 
7460  // TODO: Maybe a more relaxed tolerance is needed. RCB uses 10%.
7461  if (this->imbalance_tolerance <= 0)
7462  this->imbalance_tolerance= 10e-4;
7463 
7464  //if an input partitioning array is provided.
7465  this->part_no_array = NULL;
7466  //the length of the input partitioning array.
7467  this->recursion_depth = 0;
7468 
7469  if (pl.getPtr<Array <mj_part_t> >("mj_parts")){
7470  this->part_no_array = (mj_part_t *) pl.getPtr<Array <mj_part_t> >("mj_parts")->getRawPtr();
7471  this->recursion_depth = pl.getPtr<Array <mj_part_t> >("mj_parts")->size() - 1;
7472  this->mj_env->debug(2, "mj_parts provided by user");
7473  }
7474 
7475  //get mj specific parameters.
7476  this->distribute_points_on_cut_lines = true;
7477  this->max_concurrent_part_calculation = 1;
7478 
7479  this->mj_run_as_rcb = false;
7480  this->mj_premigration_option = 0;
7481  this->min_coord_per_rank_for_premigration = 32000;
7482 
7483  int mj_user_recursion_depth = -1;
7484  this->mj_keep_part_boxes = false;
7485  this->check_migrate_avoid_migration_option = 0;
7486  this->migration_type = 0;
7487  this->minimum_migration_imbalance = 0.35;
7488 
7489  pe = pl.getEntryPtr("mj_minimum_migration_imbalance");
7490  if (pe){
7491  double imb;
7492  imb = pe->getValue(&imb);
7493  this->minimum_migration_imbalance = imb - 1.0;
7494  }
7495 
7496  pe = pl.getEntryPtr("mj_migration_option");
7497  if (pe){
7498  this->check_migrate_avoid_migration_option = pe->getValue(&this->check_migrate_avoid_migration_option);
7499  }else {
7500  this->check_migrate_avoid_migration_option = 0;
7501  }
7502  if (this->check_migrate_avoid_migration_option > 1) this->check_migrate_avoid_migration_option = -1;
7503 
7505  pe = pl.getEntryPtr("mj_migration_type");
7506  if (pe){
7507  this->migration_type = pe->getValue(&this->migration_type);
7508  }else {
7509  this->migration_type = 0;
7510  }
7511  //std::cout << " this->migration_type:" << this->migration_type << std::endl;
7513 
7514  pe = pl.getEntryPtr("mj_concurrent_part_count");
7515  if (pe){
7516  this->max_concurrent_part_calculation = pe->getValue(&this->max_concurrent_part_calculation);
7517  }else {
7518  this->max_concurrent_part_calculation = 1; // Set to 1 if not provided.
7519  }
7520 
7521  pe = pl.getEntryPtr("mj_keep_part_boxes");
7522  if (pe){
7523  this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
7524  }else {
7525  this->mj_keep_part_boxes = false; // Set to default value
7526  }
7527 
7528 
7529  // For now, need keep_part_boxes to do pointAssign and boxAssign.
7530  // pe = pl.getEntryPtr("keep_cuts");
7531  // if (pe){
7532  // int tmp = pe->getValue(&tmp);
7533  // if (tmp) this->mj_keep_part_boxes = true;
7534  // }
7535 
7536  //need to keep part boxes if mapping type is geometric.
7537  if (this->mj_keep_part_boxes == false){
7538  pe = pl.getEntryPtr("mapping_type");
7539  if (pe){
7540  int mapping_type = -1;
7541  mapping_type = pe->getValue(&mapping_type);
7542  if (mapping_type == 0){
7543  mj_keep_part_boxes = true;
7544  }
7545  }
7546  }
7547 
7548  //check whether MJ should run as RCB.
7549  pe = pl.getEntryPtr("mj_enable_rcb");
7550  if (pe){
7551  this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
7552  }else {
7553  this->mj_run_as_rcb = false; // Set to default value
7554  }
7555 
7556  pe = pl.getEntryPtr("mj_premigration_option");
7557  if (pe){
7558  mj_premigration_option = pe->getValue(&mj_premigration_option);
7559  }else {
7560  mj_premigration_option = 0;
7561  }
7562 
7563  pe = pl.getEntryPtr("mj_premigration_coordinate_count");
7564  if (pe){
7565  min_coord_per_rank_for_premigration = pe->getValue(&min_coord_per_rank_for_premigration);
7566  }else {
7567  min_coord_per_rank_for_premigration = 32000;
7568  }
7569 
7570  pe = pl.getEntryPtr("mj_recursion_depth");
7571  if (pe){
7572  mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
7573  }else {
7574  mj_user_recursion_depth = -1; // Set to invalid value
7575  }
7576 
7577  bool val = false;
7578  pe = pl.getEntryPtr("rectilinear");
7579  if (pe) val = pe->getValue(&val);
7580  if (val){
7581  this->distribute_points_on_cut_lines = false;
7582  } else {
7583  this->distribute_points_on_cut_lines = true;
7584  }
7585 
7586  if (this->mj_run_as_rcb){
7587  mj_user_recursion_depth = (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
7588  }
7589  if (this->recursion_depth < 1){
7590  if (mj_user_recursion_depth > 0){
7591  this->recursion_depth = mj_user_recursion_depth;
7592  }
7593  else {
7594  this->recursion_depth = this->coord_dim;
7595  }
7596  }
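 // e.g., in RCB mode with num_global_parts = 1024 the recursion depth becomes
 // ceil(log2(1024)) = 10, so each step roughly bisects; otherwise a 3D problem
 // without mj_parts defaults to a recursion depth of 3 (one step per coordinate).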
7597 
7598  this->num_threads = 1;
7599 #ifdef HAVE_ZOLTAN2_OMP
7600 #pragma omp parallel
7601  {
7602  this->num_threads = omp_get_num_threads();
7603  }
7604 #endif
7605 
7606 }
7607 
7609 template <typename Adapter>
7610 void Zoltan2_AlgMJ<Adapter>::boxAssign(
7611  int dim,
7612  adapter_scalar_t *lower,
7613  adapter_scalar_t *upper,
7614  size_t &nPartsFound,
7615  typename Adapter::part_t **partsFound) const
7616 {
7617  // TODO: Implement with cuts rather than boxes to reduce algorithmic
7618  // TODO: complexity. Or at least do a search through the boxes, using
7619  // TODO: p x q x r x ... if possible.
7620 
7621  nPartsFound = 0;
7622  *partsFound = NULL;
7623 
7624  if (this->mj_keep_part_boxes) {
7625 
7626  // Get vector of part boxes
7627  RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
7628 
7629  size_t nBoxes = (*partBoxes).size();
7630  if (nBoxes == 0) {
7631  throw std::logic_error("no part boxes exist");
7632  }
7633 
7634  // Determine whether the box overlaps the globalBox at all
7635  RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
7636 
7637  if (globalBox->boxesOverlap(dim, lower, upper)) {
7638 
7639  std::vector<typename Adapter::part_t> partlist;
7640 
7641  // box overlaps the global box; find specific overlapping boxes
7642  for (size_t i = 0; i < nBoxes; i++) {
7643  try {
7644  if ((*partBoxes)[i].boxesOverlap(dim, lower, upper)) {
7645  nPartsFound++;
7646  partlist.push_back((*partBoxes)[i].getpId());
7647 
7648 // std::cout << "Given box (";
7649 // for (int j = 0; j < dim; j++)
7650 // std::cout << lower[j] << " ";
7651 // std::cout << ") x (";
7652 // for (int j = 0; j < dim; j++)
7653 // std::cout << upper[j] << " ";
7654 // std::cout << ") overlaps PartBox "
7655 // << (*partBoxes)[i].getpId() << " (";
7656 // for (int j = 0; j < dim; j++)
7657 // std::cout << (*partBoxes)[i].getlmins()[j] << " ";
7658 // std::cout << ") x (";
7659 // for (int j = 0; j < dim; j++)
7660 // std::cout << (*partBoxes)[i].getlmaxs()[j] << " ";
7661 // std::cout << ")" << std::endl;
7662  }
7663  }
7664  Z2_FORWARD_EXCEPTIONS;
7665  }
7666  if (nPartsFound) {
7667  *partsFound = new mj_part_t[nPartsFound];
7668  for (size_t i = 0; i < nPartsFound; i++)
7669  (*partsFound)[i] = partlist[i];
7670  }
7671  }
7672  else {
7673  // Box does not overlap the domain at all. Find the closest part
7674  // Not sure how to perform this operation for MJ without having the
7675  // cuts. With the RCB cuts, the concept of a part extending to
7676  // infinity was natural. With the boxes, it is much more difficult.
7677  // TODO: For now, return information indicating NO OVERLAP.
7678 
7679  }
7680  }
7681  else {
7682  throw std::logic_error("need to use keep_cuts parameter for boxAssign");
7683  }
7684 }
7685 
7687 template <typename Adapter>
7688 typename Zoltan2_AlgMJ<Adapter>::mj_part_t Zoltan2_AlgMJ<Adapter>::pointAssign(
7689  int dim,
7690  adapter_scalar_t *point) const
7691 {
7692 
7693  // TODO: Implement with cuts rather than boxes to reduce algorithmic
7694  // TODO: complexity. Or at least do a search through the boxes, using
7695  // TODO: p x q x r x ... if possible.
7696 
7697  if (this->mj_keep_part_boxes) {
7698  typename Adapter::part_t foundPart = -1;
7699 
7700  // Get vector of part boxes
7701  RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
7702 
7703  size_t nBoxes = (*partBoxes).size();
7704  if (nBoxes == 0) {
7705  throw std::logic_error("no part boxes exist");
7706  }
7707 
7708  // Determine whether the point is within the global domain
7709  RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
7710 
7711  if (globalBox->pointInBox(dim, point)) {
7712 
7713  // point is in the global domain; determine in which part it is.
7714  size_t i;
7715  for (i = 0; i < nBoxes; i++) {
7716  try {
7717  if ((*partBoxes)[i].pointInBox(dim, point)) {
7718  foundPart = (*partBoxes)[i].getpId();
7719 // std::cout << "Point (";
7720 // for (int j = 0; j < dim; j++) std::cout << point[j] << " ";
7721 // std::cout << ") found in box " << i << " part " << foundPart
7722 // << std::endl;
7723 // (*partBoxes)[i].print();
7724  break;
7725  }
7726  }
7727  Z2_FORWARD_EXCEPTIONS;
7728  }
7729 
7730  if (i == nBoxes) {
7731  // This error should never occur
7732  std::ostringstream oss;
7733  oss << "Point (";
7734  for (int j = 0; j < dim; j++) oss << point[j] << " ";
7735  oss << ") not found in domain";
7736  throw std::logic_error(oss.str());
7737  }
7738  }
7739 
7740  else {
7741  // Point is outside the global domain.
7742  // Determine to which part it is closest.
7743  // TODO: with cuts, would not need this special case
7744 
7745  typedef typename Zoltan2::coordinateModelPartBox::coord_t coord_t;
7746  size_t closestBox = 0;
7747  coord_t minDistance = std::numeric_limits<coord_t>::max();
7748  coord_t *centroid = new coord_t[dim];
7749  for (size_t i = 0; i < nBoxes; i++) {
7750  (*partBoxes)[i].computeCentroid(centroid);
7751  coord_t sum = 0.;
7752  coord_t diff;
7753  for (int j = 0; j < dim; j++) {
7754  diff = centroid[j] - point[j];
7755  sum += diff * diff;
7756  }
7757  if (sum < minDistance) {
7758  minDistance = sum;
7759  closestBox = i;
7760  }
7761  }
7762  foundPart = (*partBoxes)[closestBox].getpId();
7763  delete [] centroid;
7764  }
7765 
7766  return foundPart;
7767  }
7768  else {
7769  throw std::logic_error("need to use keep_cuts parameter for pointAssign");
7770  }
7771 }
7772 
7773 template <typename Adapter>
7774 void Zoltan2_AlgMJ<Adapter>::getCommunicationGraph(
7775  const PartitioningSolution<Adapter> * /* solution */,
7776  ArrayRCP<typename Zoltan2_AlgMJ<Adapter>::mj_part_t> &comXAdj,
7777  ArrayRCP<typename Zoltan2_AlgMJ<Adapter>::mj_part_t> &comAdj)
7778 {
7779  if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL){
7780  RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
7781  mj_part_t ntasks = (*pBoxes).size();
7782  int dim = (*pBoxes)[0].getDim();
7783  GridHash grid(pBoxes, ntasks, dim);
7784  grid.getAdjArrays(comXAdj_, comAdj_);
7785  }
7786  comAdj = comAdj_;
7787  comXAdj = comXAdj_;
7788 }
7789 
7790 
7791 template <typename Adapter>
7792 RCP<typename Zoltan2_AlgMJ<Adapter>::mj_partBoxVector_t>
7793 Zoltan2_AlgMJ<Adapter>::getGlobalBoxBoundaries() const
7794 {
7795  return this->mj_partitioner.get_kept_boxes();
7796 }
7797 
7798 
7799 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7800  typename mj_part_t>
7801 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBoxVector_t>
7802 AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::get_kept_boxes() const
7803 {
7804  if (this->mj_keep_part_boxes)
7805  return this->kept_boxes;
7806  else
7807  throw std::logic_error("Error: part boxes are not stored.");
7808 }
7809 
7810 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7811  typename mj_part_t>
7812 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBoxVector_t>
7813 AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::compute_global_box_boundaries(
7814  RCP<mj_partBoxVector_t> &localPartBoxes
7815 ) const
7816 {
7817  typedef typename Zoltan2::coordinateModelPartBox::coord_t coord_t;
7818  mj_part_t ntasks = this->num_global_parts;
7819  int dim = (*localPartBoxes)[0].getDim();
7820  coord_t *localPartBoundaries = new coord_t[ntasks * 2 *dim];
7821 
7822  memset(localPartBoundaries, 0, sizeof(coord_t) * ntasks * 2 *dim);
7823 
7824  coord_t *globalPartBoundaries = new coord_t[ntasks * 2 *dim];
7825  memset(globalPartBoundaries, 0, sizeof(coord_t) * ntasks * 2 *dim);
7826 
7827  coord_t *localPartMins = localPartBoundaries;
7828  coord_t *localPartMaxs = localPartBoundaries + ntasks * dim;
7829 
7830  coord_t *globalPartMins = globalPartBoundaries;
7831  coord_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
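 //layout of the flattened boundary buffers (length ntasks * 2 * dim):
 //[ mins of part 0 (dim entries) | mins of part 1 | ... | maxs of part 0 | maxs of part 1 | ... ]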
7832 
7833  mj_part_t boxCount = localPartBoxes->size();
7834  for (mj_part_t i = 0; i < boxCount; ++i){
7835  mj_part_t pId = (*localPartBoxes)[i].getpId();
7836  //std::cout << "me:" << comm->getRank() << " has:" << pId << std::endl;
7837 
7838  coord_t *lmins = (*localPartBoxes)[i].getlmins();
7839  coord_t *lmaxs = (*localPartBoxes)[i].getlmaxs();
7840 
7841  for (int j = 0; j < dim; ++j){
7842  localPartMins[dim * pId + j] = lmins[j];
7843  localPartMaxs[dim * pId + j] = lmaxs[j];
7844  /*
7845  std::cout << "me:" << comm->getRank() <<
7846  " dim * pId + j:"<< dim * pId + j <<
7847  " localMin:" << localPartMins[dim * pId + j] <<
7848  " localMax:" << localPartMaxs[dim * pId + j] << std::endl;
7849  */
7850  }
7851  }
7852 
7853  Teuchos::Zoltan2_BoxBoundaries<int, coord_t> reductionOp(ntasks * 2 *dim);
7854 
7855  reduceAll<int, coord_t>(*mj_problemComm, reductionOp,
7856  ntasks * 2 *dim, localPartBoundaries, globalPartBoundaries);
7857  RCP<mj_partBoxVector_t> pB(new mj_partBoxVector_t(),true);
7858  for (mj_part_t i = 0; i < ntasks; ++i){
7859  Zoltan2::coordinateModelPartBox tpb(i, dim, globalPartMins + dim * i,
7860  globalPartMaxs + dim * i);
7861 
7862  /*
7863  for (int j = 0; j < dim; ++j){
7864  std::cout << "me:" << comm->getRank() <<
7865  " dim * pId + j:"<< dim * i + j <<
7866  " globalMin:" << globalPartMins[dim * i + j] <<
7867  " globalMax:" << globalPartMaxs[dim * i + j] << std::endl;
7868  }
7869  */
7870  pB->push_back(tpb);
7871  }
7872  delete []localPartBoundaries;
7873  delete []globalPartBoundaries;
7874  //RCP <mj_partBoxVector_t> tmpRCPBox(pB, true);
7875  return pB;
7876 }
7877 } // namespace Zoltan2
7878 
7879 #endif
Multi Jagged coordinate partitioning algorithm.
A gathering of useful namespace methods.
void uqsort(IT n, uSortItem< IT, WT > *arr)
Quick sort function. Sorts the arr of uSortItems, with respect to increasing vals.
Zoltan2_AlgMJ(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, const RCP< const coordinateModel_t > &coords)
Contains Teuchos redcution operators for the Multi-jagged algorthm.
Multi Jagged coordinate partitioning algorithm.
static void getValidParameters(ParameterList &pl)
Set up validators specific to this algorithm.