49 #ifndef _ZOLTAN2_ALGMultiJagged_HPP_
50 #define _ZOLTAN2_ALGMultiJagged_HPP_
57 #include <Teuchos_StandardParameterEntryValidators.hpp>
59 #include <Tpetra_Distributor.hpp>
60 #include <Teuchos_ParameterList.hpp>
67 #if defined(__cplusplus) && __cplusplus >= 201103L
68 #include <unordered_map>
70 #include <Teuchos_Hashtable.hpp>
71 #endif // C++11 is enabled
73 #ifdef ZOLTAN2_USEZOLTANCOMM
74 #ifdef HAVE_ZOLTAN2_MPI
75 #define ENABLE_ZOLTAN_MIGRATION
76 #include "zoltan_comm_cpp.h"
77 #include "zoltan_types.h"
81 #ifdef HAVE_ZOLTAN2_OMP
85 #define LEAST_SIGNIFICANCE 0.0001
86 #define SIGNIFICANCE_MUL 1000
91 #define FUTURE_REDUCEALL_CUTOFF 1500000
94 #define MIN_WORK_LAST_DIM 1000
99 #define ZOLTAN2_ABS(x) ((x) >= 0 ? (x) : -(x))
101 #define imbalanceOf(Wachieved, totalW, expectedRatio) \
102 (Wachieved) / ((totalW) * (expectedRatio)) - 1
103 #define imbalanceOf2(Wachieved, wExpected) \
104 (Wachieved) / (wExpected) - 1
107 #define ZOLTAN2_ALGMULTIJAGGED_SWAP(a,b,temp) temp=(a);(a)=(b);(b)=temp;
116 template <
typename Ordinal,
typename T>
135 size(s_), _EPSILON (std::numeric_limits<T>::
epsilon()){}
139 void reduce(
const Ordinal count,
const T inBuffer[], T inoutBuffer[])
const
141 for (Ordinal i=0; i < count; i++){
142 if (
Z2_ABS(inBuffer[i]) > _EPSILON){
143 inoutBuffer[i] = inBuffer[i];
155 template <
typename T>
160 throw "cannot allocate memory";
172 template <
typename T>
188 template <
typename IT,
typename CT,
typename WT>
209 this->
index = index_;
210 this->
count = count_;
226 void set(IT index_ ,CT count_, WT *vals_){
227 this->
index = index_;
228 this->
count = count_;
240 bool operator<(const uMultiSortItem<IT,CT,WT>& other)
const{
241 assert (this->
count == other.count);
242 for(CT i = 0; i < this->
count; ++i){
248 if(this->
val[i] < other.val[i]){
257 return this->
index < other.index;
261 for(CT i = 0; i < this->
count; ++i){
267 if(this->
val[i] > other.
val[i]){
284 template <
class IT,
class WT>
295 template <
class IT,
class WT>
301 IT i, ir=n, j, k, l=1;
302 IT jstack=0, istack[50];
311 for (j=l+1;j<=ir;j++)
317 if (arr[i].val <= aval)
333 if (arr[l+1].val > arr[ir].val)
337 if (arr[l].val > arr[ir].val)
341 if (arr[l+1].val > arr[l].val)
351 do i++;
while (arr[i].val < aval);
352 do j--;
while (arr[j].val > aval);
359 if (jstack > NSTACK){
360 std::cout <<
"uqsort: NSTACK too small in sort." << std::endl;
379 template <
class IT,
class WT,
class SIGN>
386 bool operator<(const uSignedSortItem<IT, WT, SIGN>& rhs)
const {
388 if (this->
signbit < rhs.signbit){
392 else if (this->
signbit == rhs.signbit){
394 if (this->
val < rhs.val){
398 else if (this->
val > rhs.val){
424 else if (this->
val > rhs.
val){
437 bool operator<=(const uSignedSortItem<IT, WT, SIGN>& rhs){
438 return !(*
this > rhs);}
440 return !(*
this < rhs);}
446 template <
class IT,
class WT,
class SIGN>
451 IT i, ir=n, j, k, l=1;
452 IT jstack=0, istack[50];
460 for (j=l+1;j<=ir;j++)
482 if (arr[l+1] > arr[ir])
486 if (arr[l] > arr[ir])
490 if (arr[l+1] > arr[l])
499 do i++;
while (arr[i] < a);
500 do j--;
while (arr[j] > a);
507 if (jstack > NSTACK){
508 std::cout <<
"uqsort: NSTACK too small in sort." << std::endl;
530 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
536 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
538 RCP<const Environment> mj_env;
539 RCP<const Comm<int> > mj_problemComm;
541 double imbalance_tolerance;
542 mj_part_t *part_no_array;
544 int coord_dim, num_weights_per_coord;
546 size_t initial_num_loc_coords;
549 mj_lno_t num_local_coords;
550 mj_gno_t num_global_coords;
552 mj_scalar_t **mj_coordinates;
553 mj_scalar_t **mj_weights;
554 bool *mj_uniform_parts;
555 mj_scalar_t **mj_part_sizes;
556 bool *mj_uniform_weights;
558 ArrayView<const mj_gno_t> mj_gnos;
559 size_t num_global_parts;
561 mj_gno_t *initial_mj_gnos;
562 mj_gno_t *current_mj_gnos;
563 int *owner_of_coordinate;
565 mj_lno_t *coordinate_permutations;
566 mj_lno_t *new_coordinate_permutations;
567 mj_part_t *assigned_part_ids;
570 mj_lno_t *new_part_xadj;
573 bool distribute_points_on_cut_lines;
574 mj_part_t max_concurrent_part_calculation;
577 int mj_user_recursion_depth;
578 bool mj_keep_part_boxes;
580 int check_migrate_avoid_migration_option;
583 double minimum_migration_imbalance;
597 mj_part_t num_first_level_parts;
598 const mj_part_t *first_level_distribution;
600 mj_part_t total_num_cut ;
601 mj_part_t total_num_part;
603 mj_part_t max_num_part_along_dim ;
604 mj_part_t max_num_cut_along_dim;
605 size_t max_num_total_part_along_dim;
607 mj_part_t total_dim_num_reduce_all;
608 mj_part_t last_dim_num_part;
612 RCP<Comm<int> > comm;
614 mj_scalar_t sEpsilon;
616 mj_scalar_t maxScalar_t;
617 mj_scalar_t minScalar_t;
619 mj_scalar_t *all_cut_coordinates;
620 mj_scalar_t *max_min_coords;
621 mj_scalar_t *process_cut_line_weight_to_put_left;
622 mj_scalar_t **thread_cut_line_weight_to_put_left;
628 mj_scalar_t *cut_coordinates_work_array;
631 mj_scalar_t *target_part_weights;
633 mj_scalar_t *cut_upper_bound_coordinates ;
634 mj_scalar_t *cut_lower_bound_coordinates ;
635 mj_scalar_t *cut_lower_bound_weights ;
636 mj_scalar_t *cut_upper_bound_weights ;
638 mj_scalar_t *process_local_min_max_coord_total_weight ;
639 mj_scalar_t *global_min_max_coord_total_weight ;
643 bool *is_cut_line_determined;
646 mj_part_t *my_incomplete_cut_count;
648 double **thread_part_weights;
650 double **thread_part_weight_work;
653 mj_scalar_t **thread_cut_left_closest_point;
655 mj_scalar_t **thread_cut_right_closest_point;
658 mj_lno_t **thread_point_counts;
660 mj_scalar_t *process_rectilinear_cut_weight;
661 mj_scalar_t *global_rectilinear_cut_weight;
667 mj_scalar_t *total_part_weight_left_right_closests ;
668 mj_scalar_t *global_total_part_weight_left_right_closests;
670 RCP<mj_partBoxVector_t> kept_boxes;
673 RCP<mj_partBox_t> global_box;
674 int myRank, myActualRank;
676 bool divide_to_prime_first;
685 void set_part_specifications();
692 inline mj_part_t get_part_count(
693 mj_part_t num_total_future,
699 void allocate_set_work_memory();
706 void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);
709 void compute_global_box();
725 mj_part_t update_part_num_arrays(
726 std::vector<mj_part_t> &num_partitioning_in_current_dim,
727 std::vector<mj_part_t> *future_num_part_in_parts,
728 std::vector<mj_part_t> *next_future_num_parts_in_parts,
729 mj_part_t &future_num_parts,
730 mj_part_t current_num_parts,
731 int current_iteration,
732 RCP<mj_partBoxVector_t> input_part_boxes,
733 RCP<mj_partBoxVector_t> output_part_boxes,
734 mj_part_t atomic_part_count);
747 void mj_get_local_min_max_coord_totW(
748 mj_lno_t coordinate_begin_index,
749 mj_lno_t coordinate_end_index,
750 mj_lno_t *mj_current_coordinate_permutations,
751 mj_scalar_t *mj_current_dim_coords,
752 mj_scalar_t &min_coordinate,
753 mj_scalar_t &max_coordinate,
754 mj_scalar_t &total_weight);
763 void mj_get_global_min_max_coord_totW(
764 mj_part_t current_concurrent_num_parts,
765 mj_scalar_t *local_min_max_total,
766 mj_scalar_t *global_min_max_total);
795 void mj_get_initial_cut_coords_target_weights(
796 mj_scalar_t min_coord,
797 mj_scalar_t max_coord,
799 mj_scalar_t global_weight,
800 mj_scalar_t *initial_cut_coords ,
801 mj_scalar_t *target_part_weights ,
803 std::vector <mj_part_t> *future_num_part_in_parts,
804 std::vector <mj_part_t> *next_future_num_parts_in_parts,
805 mj_part_t concurrent_current_part,
806 mj_part_t obtained_part_index,
807 mj_part_t num_target_first_level_parts = 1,
808 const mj_part_t *target_first_level_dist = NULL);
822 void set_initial_coordinate_parts(
823 mj_scalar_t &max_coordinate,
824 mj_scalar_t &min_coordinate,
825 mj_part_t &concurrent_current_part_index,
826 mj_lno_t coordinate_begin_index,
827 mj_lno_t coordinate_end_index,
828 mj_lno_t *mj_current_coordinate_permutations,
829 mj_scalar_t *mj_current_dim_coords,
830 mj_part_t *mj_part_ids,
831 mj_part_t &partition_count);
844 mj_scalar_t *mj_current_dim_coords,
845 double imbalanceTolerance,
846 mj_part_t current_work_part,
847 mj_part_t current_concurrent_num_parts,
848 mj_scalar_t *current_cut_coordinates,
849 mj_part_t total_incomplete_cut_count,
850 std::vector <mj_part_t> &num_partitioning_in_current_dim);
871 void mj_1D_part_get_thread_part_weights(
872 size_t total_part_count,
874 mj_scalar_t max_coord,
875 mj_scalar_t min_coord,
876 mj_lno_t coordinate_begin_index,
877 mj_lno_t coordinate_end_index,
878 mj_scalar_t *mj_current_dim_coords,
879 mj_scalar_t *temp_current_cut_coords,
880 bool *current_cut_status,
881 double *my_current_part_weights,
882 mj_scalar_t *my_current_left_closest,
883 mj_scalar_t *my_current_right_closest);
892 void mj_accumulate_thread_results(
893 const std::vector <mj_part_t> &num_partitioning_in_current_dim,
894 mj_part_t current_work_part,
895 mj_part_t current_concurrent_num_parts);
927 void mj_get_new_cut_coordinates(
928 const size_t &num_total_part,
929 const mj_part_t &num_cuts,
930 const mj_scalar_t &max_coordinate,
931 const mj_scalar_t &min_coordinate,
932 const mj_scalar_t &global_total_weight,
933 const double &used_imbalance_tolerance,
934 mj_scalar_t * current_global_part_weights,
935 const mj_scalar_t * current_local_part_weights,
936 const mj_scalar_t *current_part_target_weights,
937 bool *current_cut_line_determined,
938 mj_scalar_t *current_cut_coordinates,
939 mj_scalar_t *current_cut_upper_bounds,
940 mj_scalar_t *current_cut_lower_bounds,
941 mj_scalar_t *current_global_left_closest_points,
942 mj_scalar_t *current_global_right_closest_points,
943 mj_scalar_t * current_cut_lower_bound_weights,
944 mj_scalar_t * current_cut_upper_weights,
945 mj_scalar_t *new_current_cut_coordinates,
946 mj_scalar_t *current_part_cut_line_weight_to_put_left,
947 mj_part_t *rectilinear_cut_count,
948 mj_part_t &my_num_incomplete_cut);
959 void mj_calculate_new_cut_position (
960 mj_scalar_t cut_upper_bound,
961 mj_scalar_t cut_lower_bound,
962 mj_scalar_t cut_upper_weight,
963 mj_scalar_t cut_lower_weight,
964 mj_scalar_t expected_weight,
965 mj_scalar_t &new_cut_position);
977 void mj_create_new_partitions(
979 mj_scalar_t *mj_current_dim_coords,
980 mj_scalar_t *current_concurrent_cut_coordinate,
981 mj_lno_t coordinate_begin,
982 mj_lno_t coordinate_end,
983 mj_scalar_t *used_local_cut_line_weight_to_left,
984 double **used_thread_part_weight_work,
985 mj_lno_t *out_part_xadj);
1009 bool mj_perform_migration(
1010 mj_part_t in_num_parts,
1011 mj_part_t &out_num_parts,
1012 std::vector<mj_part_t> *next_future_num_parts_in_parts,
1013 mj_part_t &output_part_begin_index,
1014 size_t migration_reduce_all_population,
1015 mj_lno_t num_coords_for_last_dim_part,
1016 std::string iteration,
1017 RCP<mj_partBoxVector_t> &input_part_boxes,
1018 RCP<mj_partBoxVector_t> &output_part_boxes);
1029 void get_processor_num_points_in_parts(
1030 mj_part_t num_procs,
1031 mj_part_t num_parts,
1032 mj_gno_t *&num_points_in_all_processor_parts);
1046 bool mj_check_to_migrate(
1047 size_t migration_reduce_all_population,
1048 mj_lno_t num_coords_for_last_dim_part,
1049 mj_part_t num_procs,
1050 mj_part_t num_parts,
1051 mj_gno_t *num_points_in_all_processor_parts);
1071 void mj_migration_part_proc_assignment(
1072 mj_gno_t * num_points_in_all_processor_parts,
1073 mj_part_t num_parts,
1074 mj_part_t num_procs,
1075 mj_lno_t *send_count_to_each_proc,
1076 std::vector<mj_part_t> &processor_ranks_for_subcomm,
1077 std::vector<mj_part_t> *next_future_num_parts_in_parts,
1078 mj_part_t &out_num_part,
1079 std::vector<mj_part_t> &out_part_indices,
1080 mj_part_t &output_part_numbering_begin_index,
1081 int *coordinate_destinations);
1099 void mj_assign_proc_to_parts(
1100 mj_gno_t * num_points_in_all_processor_parts,
1101 mj_part_t num_parts,
1102 mj_part_t num_procs,
1103 mj_lno_t *send_count_to_each_proc,
1104 std::vector<mj_part_t> &processor_ranks_for_subcomm,
1105 std::vector<mj_part_t> *next_future_num_parts_in_parts,
1106 mj_part_t &out_part_index,
1107 mj_part_t &output_part_numbering_begin_index,
1108 int *coordinate_destinations);
1120 void assign_send_destinations(
1121 mj_part_t num_parts,
1122 mj_part_t *part_assignment_proc_begin_indices,
1123 mj_part_t *processor_chains_in_parts,
1124 mj_lno_t *send_count_to_each_proc,
1125 int *coordinate_destinations);
1139 void assign_send_destinations2(
1140 mj_part_t num_parts,
1142 int *coordinate_destinations,
1143 mj_part_t &output_part_numbering_begin_index,
1144 std::vector<mj_part_t> *next_future_num_parts_in_parts);
1162 void mj_assign_parts_to_procs(
1163 mj_gno_t * num_points_in_all_processor_parts,
1164 mj_part_t num_parts,
1165 mj_part_t num_procs,
1166 mj_lno_t *send_count_to_each_proc,
1167 std::vector<mj_part_t> *next_future_num_parts_in_parts,
1168 mj_part_t &out_num_part,
1169 std::vector<mj_part_t> &out_part_indices,
1170 mj_part_t &output_part_numbering_begin_index,
1171 int *coordinate_destinations);
1185 void mj_migrate_coords(
1186 mj_part_t num_procs,
1187 mj_lno_t &num_new_local_points,
1188 std::string iteration,
1189 int *coordinate_destinations,
1190 mj_part_t num_parts);
1198 void create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm);
1206 void fill_permutation_array(
1207 mj_part_t output_num_parts,
1208 mj_part_t num_parts);
1218 void set_final_parts(
1219 mj_part_t current_num_parts,
1220 mj_part_t output_part_begin_index,
1221 RCP<mj_partBoxVector_t> &output_part_boxes,
1222 bool is_data_ever_migrated);
1225 void free_work_memory();
1239 void create_consistent_chunks(
1240 mj_part_t num_parts,
1241 mj_scalar_t *mj_current_dim_coords,
1242 mj_scalar_t *current_concurrent_cut_coordinate,
1243 mj_lno_t coordinate_begin,
1244 mj_lno_t coordinate_end,
1245 mj_scalar_t *used_local_cut_line_weight_to_left,
1246 mj_lno_t *out_part_xadj,
1253 mj_part_t find_largest_prime_factor(mj_part_t num_parts){
1254 mj_part_t largest_factor = 1;
1255 mj_part_t n = num_parts;
1256 mj_part_t divisor = 2;
1258 while (n % divisor == 0){
1260 largest_factor = divisor;
1263 if (divisor * divisor > n){
1270 return largest_factor;
1303 const RCP<const Environment> &env,
1304 RCP<
const Comm<int> > &problemComm,
1306 double imbalance_tolerance,
1307 size_t num_global_parts,
1308 mj_part_t *part_no_array,
1309 int recursion_depth,
1312 mj_lno_t num_local_coords,
1313 mj_gno_t num_global_coords,
1314 const mj_gno_t *initial_mj_gnos,
1315 mj_scalar_t **mj_coordinates,
1317 int num_weights_per_coord,
1318 bool *mj_uniform_weights,
1319 mj_scalar_t **mj_weights,
1320 bool *mj_uniform_parts,
1321 mj_scalar_t **mj_part_sizes,
1323 mj_part_t *&result_assigned_part_ids,
1324 mj_gno_t *&result_mj_gnos);
1336 bool distribute_points_on_cut_lines_,
1337 int max_concurrent_part_calculation_,
1338 int check_migrate_avoid_migration_option_,
1339 double minimum_migration_imbalance_,
int migration_type_ = 0);
1353 RCP<mj_partBoxVector_t> &localPartBoxes)
const;
1398 const RCP<const Environment> &env,
1399 mj_lno_t num_total_coords,
1400 mj_lno_t num_selected_coords,
1401 size_t num_target_part,
1403 mj_scalar_t **mj_coordinates,
1404 mj_lno_t *initial_selected_coords_output_permutation,
1405 mj_lno_t *output_xadj,
1406 int recursion_depth,
1407 const mj_part_t *part_no_array,
1408 bool partition_along_longest_dim,
1409 int num_ranks_per_node,
1410 bool divide_to_prime_first_,
1411 mj_part_t num_first_level_parts_ = 1,
1412 const mj_part_t *first_level_distribution_ = NULL);
1458 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1461 const RCP<const Environment> &env,
1462 mj_lno_t num_total_coords,
1463 mj_lno_t num_selected_coords,
1464 size_t num_target_part,
1466 mj_scalar_t **mj_coordinates_,
1467 mj_lno_t *inital_adjList_output_adjlist,
1468 mj_lno_t *output_xadj,
1470 const mj_part_t *part_no_array_,
1471 bool partition_along_longest_dim,
1472 int num_ranks_per_node,
1473 bool divide_to_prime_first_,
1474 mj_part_t num_first_level_parts_,
1475 const mj_part_t *first_level_distribution_) {
1478 const RCP<Comm<int> > commN;
1479 this->mj_problemComm =
1480 Teuchos::DefaultComm<int>::getDefaultSerialComm(commN);
1482 Teuchos::rcp_const_cast<Comm<int> >(this->mj_problemComm);
1483 this->myActualRank = this->myRank = 1;
1485 #ifdef HAVE_ZOLTAN2_OMP
1490 this->divide_to_prime_first = divide_to_prime_first_;
1495 this->imbalance_tolerance = 0;
1496 this->num_global_parts = num_target_part;
1497 this->part_no_array = (mj_part_t *)part_no_array_;
1498 this->recursion_depth = rd;
1502 this->num_first_level_parts = num_first_level_parts_;
1503 this->first_level_distribution = (mj_part_t *)first_level_distribution_;
1505 this->coord_dim = coord_dim_;
1506 this->num_local_coords = num_total_coords;
1507 this->num_global_coords = num_total_coords;
1508 this->mj_coordinates = mj_coordinates_;
1512 this->initial_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);
1514 this->num_weights_per_coord = 0;
1515 bool *tmp_mj_uniform_weights =
new bool[1];
1516 this->mj_uniform_weights = tmp_mj_uniform_weights;
1517 this->mj_uniform_weights[0] =
true;
1519 mj_scalar_t **tmp_mj_weights =
new mj_scalar_t *[1];
1520 this->mj_weights = tmp_mj_weights;
1522 bool *tmp_mj_uniform_parts =
new bool[1];
1523 this->mj_uniform_parts = tmp_mj_uniform_parts;
1524 this->mj_uniform_parts[0] =
true;
1526 mj_scalar_t **tmp_mj_part_sizes =
new mj_scalar_t * [1];
1527 this->mj_part_sizes = tmp_mj_part_sizes;
1528 this->mj_part_sizes[0] = NULL;
1530 this->num_threads = 1;
1531 this->set_part_specifications();
1533 this->allocate_set_work_memory();
1535 this->part_xadj[0] =
static_cast<mj_lno_t
>(num_selected_coords);
1536 for(
size_t i = 0; i < static_cast<size_t>(num_total_coords); ++i){
1537 this->coordinate_permutations[i] = inital_adjList_output_adjlist[i];
1540 mj_part_t current_num_parts = 1;
1542 mj_scalar_t *current_cut_coordinates = this->all_cut_coordinates;
1544 mj_part_t future_num_parts = this->total_num_part;
1546 std::vector<mj_part_t> *future_num_part_in_parts =
new std::vector<mj_part_t> ();
1547 std::vector<mj_part_t> *next_future_num_parts_in_parts =
new std::vector<mj_part_t> ();
1548 next_future_num_parts_in_parts->push_back(this->num_global_parts);
1549 RCP<mj_partBoxVector_t> t1;
1550 RCP<mj_partBoxVector_t> t2;
1553 std::vector <uSignedSortItem<int, mj_scalar_t, char> > coord_dimension_range_sorted(this->coord_dim);
1555 std::vector <mj_scalar_t> coord_dim_mins(this->coord_dim);
1556 std::vector <mj_scalar_t> coord_dim_maxs(this->coord_dim);
1558 for (
int i = 0; i < this->recursion_depth; ++i) {
1562 std::vector <mj_part_t> num_partitioning_in_current_dim;
1576 std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
1577 future_num_part_in_parts = next_future_num_parts_in_parts;
1578 next_future_num_parts_in_parts = tmpPartVect;
1583 next_future_num_parts_in_parts->clear();
1587 mj_part_t output_part_count_in_dimension =
1588 this->update_part_num_arrays(
1589 num_partitioning_in_current_dim,
1590 future_num_part_in_parts,
1591 next_future_num_parts_in_parts,
1596 t2, num_ranks_per_node);
1601 if(output_part_count_in_dimension == current_num_parts) {
1602 tmpPartVect= future_num_part_in_parts;
1603 future_num_part_in_parts = next_future_num_parts_in_parts;
1604 next_future_num_parts_in_parts = tmpPartVect;
1609 std::string istring = Teuchos::toString<int>(i);
1613 this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);
1616 mj_part_t output_part_index = 0;
1619 mj_part_t output_coordinate_end_index = 0;
1621 mj_part_t current_work_part = 0;
1622 mj_part_t current_concurrent_num_parts = 1;
1624 mj_part_t obtained_part_index = 0;
1627 int coordInd = i % this->coord_dim;
1628 mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
1632 for (; current_work_part < current_num_parts;
1633 current_work_part += current_concurrent_num_parts) {
1639 mj_part_t actual_work_part_count = 0;
1643 for (
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1644 mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;
1648 if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
1651 ++actual_work_part_count;
1652 mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
1653 mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts == 0 ?
1654 0 : this->part_xadj[current_work_part_in_concurrent_parts - 1];
1664 if (partition_along_longest_dim) {
1666 mj_scalar_t best_weight_coord = 0;
1667 for (
int coord_traverse_ind = 0; coord_traverse_ind < this->coord_dim; ++coord_traverse_ind){
1668 mj_scalar_t best_min_coord = 0;
1669 mj_scalar_t best_max_coord = 0;
1672 this->mj_get_local_min_max_coord_totW(
1673 coordinate_begin_index,
1674 coordinate_end_index,
1675 this->coordinate_permutations,
1676 this->mj_coordinates[coord_traverse_ind],
1682 coord_dim_mins[coord_traverse_ind] = best_min_coord;
1683 coord_dim_maxs[coord_traverse_ind] = best_max_coord;
1684 mj_scalar_t best_range = best_max_coord - best_min_coord;
1685 coord_dimension_range_sorted[coord_traverse_ind].id = coord_traverse_ind;
1686 coord_dimension_range_sorted[coord_traverse_ind].val = best_range;
1687 coord_dimension_range_sorted[coord_traverse_ind].signbit = 1;
1691 uqSignsort(this->coord_dim, p_coord_dimension_range_sorted);
1692 coordInd = p_coord_dimension_range_sorted[this->coord_dim - 1].
id;
1707 mj_current_dim_coords = this->mj_coordinates[coordInd];
1709 this->process_local_min_max_coord_total_weight[kk] = coord_dim_mins[coordInd];
1710 this->process_local_min_max_coord_total_weight[kk+ current_concurrent_num_parts] = coord_dim_maxs[coordInd];
1711 this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts] = best_weight_coord;
1715 this->mj_get_local_min_max_coord_totW(
1716 coordinate_begin_index,
1717 coordinate_end_index,
1718 this->coordinate_permutations,
1719 mj_current_dim_coords,
1720 this->process_local_min_max_coord_total_weight[kk],
1721 this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts],
1722 this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts]
1728 if (actual_work_part_count > 0) {
1730 this->mj_get_global_min_max_coord_totW(
1731 current_concurrent_num_parts,
1732 this->process_local_min_max_coord_total_weight,
1733 this->global_min_max_coord_total_weight);
1737 mj_part_t total_incomplete_cut_count = 0;
1742 mj_part_t concurrent_part_cut_shift = 0;
1743 mj_part_t concurrent_part_part_shift = 0;
1746 for (
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1747 mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
1748 mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
1749 current_concurrent_num_parts];
1750 mj_scalar_t global_total_weight = this->global_min_max_coord_total_weight[kk +
1751 2 * current_concurrent_num_parts];
1753 mj_part_t concurrent_current_part_index = current_work_part + kk;
1755 mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];
1757 mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
1758 mj_scalar_t *current_target_part_weights = this->target_part_weights +
1759 concurrent_part_part_shift;
1761 concurrent_part_cut_shift += partition_count - 1;
1763 concurrent_part_part_shift += partition_count;
1767 if(partition_count > 1 && min_coordinate <= max_coordinate){
1771 total_incomplete_cut_count += partition_count - 1;
1774 this->my_incomplete_cut_count[kk] = partition_count - 1;
1780 first_level_distribution != NULL &&
1781 num_first_level_parts > 1) {
1783 this->mj_get_initial_cut_coords_target_weights(
1786 partition_count - 1,
1787 global_total_weight,
1789 current_target_part_weights,
1790 future_num_part_in_parts,
1791 next_future_num_parts_in_parts,
1792 concurrent_current_part_index,
1793 obtained_part_index,
1794 this->num_first_level_parts,
1795 this->first_level_distribution);
1801 this->mj_get_initial_cut_coords_target_weights(
1804 partition_count - 1,
1805 global_total_weight,
1807 current_target_part_weights,
1808 future_num_part_in_parts,
1809 next_future_num_parts_in_parts,
1810 concurrent_current_part_index,
1811 obtained_part_index);
1814 mj_lno_t coordinate_end_index = this->part_xadj[concurrent_current_part_index];
1815 mj_lno_t coordinate_begin_index = concurrent_current_part_index == 0 ?
1816 0 : this->part_xadj[concurrent_current_part_index - 1];
1819 this->set_initial_coordinate_parts(
1822 concurrent_current_part_index,
1823 coordinate_begin_index, coordinate_end_index,
1824 this->coordinate_permutations,
1825 mj_current_dim_coords,
1826 this->assigned_part_ids,
1832 this->my_incomplete_cut_count[kk] = 0;
1834 obtained_part_index += partition_count;
1838 double used_imbalance = 0;
1843 mj_current_dim_coords,
1846 current_concurrent_num_parts,
1847 current_cut_coordinates,
1848 total_incomplete_cut_count,
1849 num_partitioning_in_current_dim);
1852 obtained_part_index += current_concurrent_num_parts;
1858 mj_part_t output_array_shift = 0;
1859 mj_part_t cut_shift = 0;
1860 size_t tlr_shift = 0;
1861 size_t partweight_array_shift = 0;
1863 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
1864 mj_part_t current_concurrent_work_part = current_work_part + kk;
1865 mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];
1868 if((num_parts != 1 ) && this->global_min_max_coord_total_weight[kk] >
1869 this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {
1871 for(mj_part_t jj = 0; jj < num_parts; ++jj){
1872 this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
1874 cut_shift += num_parts - 1;
1875 tlr_shift += (4 *(num_parts - 1) + 1);
1876 output_array_shift += num_parts;
1877 partweight_array_shift += (2 * (num_parts - 1) + 1);
1881 mj_lno_t coordinate_end = this->part_xadj[current_concurrent_work_part];
1882 mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[current_concurrent_work_part
1884 mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
1885 mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
1888 for(
int ii = 0; ii < this->num_threads; ++ii){
1889 this->thread_part_weight_work[ii] = this->thread_part_weights[ii] + partweight_array_shift;
1894 this->create_consistent_chunks(
1896 mj_current_dim_coords,
1897 current_concurrent_cut_coordinate,
1900 used_local_cut_line_weight_to_left,
1901 this->new_part_xadj + output_part_index + output_array_shift,
1903 partition_along_longest_dim,
1904 p_coord_dimension_range_sorted);
1909 mj_lno_t part_size = coordinate_end - coordinate_begin;
1910 *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
1911 memcpy(this->new_coordinate_permutations + coordinate_begin,
1912 this->coordinate_permutations + coordinate_begin,
1913 part_size *
sizeof(mj_lno_t));
1918 cut_shift += num_parts - 1;
1919 tlr_shift += (4 *(num_parts - 1) + 1);
1920 output_array_shift += num_parts;
1921 partweight_array_shift += (2 * (num_parts - 1) + 1);
1930 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
1931 mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
1932 for (mj_part_t ii = 0;ii < num_parts ; ++ii){
1934 this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
1936 mj_lno_t coordinate_end = this->new_part_xadj[output_part_index+ii];
1937 mj_lno_t coordinate_begin = this->new_part_xadj[output_part_index];
1939 for (mj_lno_t task_traverse = coordinate_begin; task_traverse < coordinate_end; ++task_traverse){
1940 mj_lno_t l = this->new_coordinate_permutations[task_traverse];
1942 mj_current_dim_coords[l] = -mj_current_dim_coords[l];
1947 output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
1949 output_part_index += num_parts ;
1956 current_num_parts = output_part_count_in_dimension;
1959 mj_lno_t * tmp = this->coordinate_permutations;
1960 this->coordinate_permutations = this->new_coordinate_permutations;
1961 this->new_coordinate_permutations = tmp;
1963 freeArray<mj_lno_t>(this->part_xadj);
1964 this->part_xadj = this->new_part_xadj;
1965 this->new_part_xadj = NULL;
1968 for(mj_lno_t i = 0; i < num_total_coords; ++i){
1969 inital_adjList_output_adjlist[i] = this->coordinate_permutations[i];
1974 for(
size_t i = 0; i < this->num_global_parts ; ++i){
1975 output_xadj[i+1] = this->part_xadj[i];
1978 delete future_num_part_in_parts;
1979 delete next_future_num_parts_in_parts;
1982 freeArray<mj_part_t>(this->assigned_part_ids);
1983 freeArray<mj_gno_t>(this->initial_mj_gnos);
1984 freeArray<mj_gno_t>(this->current_mj_gnos);
1985 freeArray<bool>(tmp_mj_uniform_weights);
1986 freeArray<bool>(tmp_mj_uniform_parts);
1987 freeArray<mj_scalar_t *>(tmp_mj_weights);
1988 freeArray<mj_scalar_t *>(tmp_mj_part_sizes);
1990 this->free_work_memory();
1992 #ifdef HAVE_ZOLTAN2_OMP
2000 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2003 mj_env(), mj_problemComm(), imbalance_tolerance(0),
2004 part_no_array(NULL), recursion_depth(0), coord_dim(0),
2005 num_weights_per_coord(0), initial_num_loc_coords(0),
2006 initial_num_glob_coords(0),
2007 num_local_coords(0), num_global_coords(0), mj_coordinates(NULL),
2008 mj_weights(NULL), mj_uniform_parts(NULL), mj_part_sizes(NULL),
2009 mj_uniform_weights(NULL), mj_gnos(), num_global_parts(1),
2010 initial_mj_gnos(NULL), current_mj_gnos(NULL), owner_of_coordinate(NULL),
2011 coordinate_permutations(NULL), new_coordinate_permutations(NULL),
2012 assigned_part_ids(NULL), part_xadj(NULL), new_part_xadj(NULL),
2013 distribute_points_on_cut_lines(true), max_concurrent_part_calculation(1),
2014 mj_run_as_rcb(false), mj_user_recursion_depth(0), mj_keep_part_boxes(false),
2015 check_migrate_avoid_migration_option(0), migration_type(0), minimum_migration_imbalance(0.30),
2016 num_threads(1), num_first_level_parts(1), first_level_distribution(NULL),
2017 total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
2018 max_num_cut_along_dim(0), max_num_total_part_along_dim(0), total_dim_num_reduce_all(0),
2019 last_dim_num_part(0), comm(), fEpsilon(0), sEpsilon(0), maxScalar_t(0), minScalar_t(0),
2020 all_cut_coordinates(NULL), max_min_coords(NULL), process_cut_line_weight_to_put_left(NULL),
2021 thread_cut_line_weight_to_put_left(NULL), cut_coordinates_work_array(NULL),
2022 target_part_weights(NULL), cut_upper_bound_coordinates(NULL), cut_lower_bound_coordinates(NULL),
2023 cut_lower_bound_weights(NULL), cut_upper_bound_weights(NULL),
2024 process_local_min_max_coord_total_weight(NULL), global_min_max_coord_total_weight(NULL),
2025 is_cut_line_determined(NULL), my_incomplete_cut_count(NULL),
2026 thread_part_weights(NULL), thread_part_weight_work(NULL),
2027 thread_cut_left_closest_point(NULL), thread_cut_right_closest_point(NULL),
2028 thread_point_counts(NULL), process_rectilinear_cut_weight(NULL),
2029 global_rectilinear_cut_weight(NULL),total_part_weight_left_right_closests(NULL),
2030 global_total_part_weight_left_right_closests(NULL),
2031 kept_boxes(),global_box(),
2032 myRank(0), myActualRank(0), divide_to_prime_first(false)
2037 this->maxScalar_t = std::numeric_limits<mj_scalar_t>::max();
2038 this->minScalar_t = -std::numeric_limits<mj_scalar_t>::max();
2046 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2048 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBox_t>
2051 return this->global_box;
2057 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2060 this->mj_keep_part_boxes =
true;
2071 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2075 this->total_num_cut = 0;
2076 this->total_num_part = 1;
2077 this->max_num_part_along_dim = 0;
2078 this->total_dim_num_reduce_all = 0;
2079 this->last_dim_num_part = 1;
2082 this->max_num_cut_along_dim = 0;
2083 this->max_num_total_part_along_dim = 0;
2085 if (this->part_no_array) {
2087 for (
int i = 0; i < this->recursion_depth; ++i){
2088 this->total_dim_num_reduce_all += this->total_num_part;
2089 this->total_num_part *= this->part_no_array[i];
2090 if(this->part_no_array[i] > this->max_num_part_along_dim) {
2091 this->max_num_part_along_dim = this->part_no_array[i];
2094 this->last_dim_num_part = this->total_num_part / this->part_no_array[recursion_depth-1];
2095 this->num_global_parts = this->total_num_part;
2098 mj_part_t future_num_parts = this->num_global_parts;
2102 if (this->first_level_distribution != NULL &&
2103 this->num_first_level_parts > 1) {
2104 this->max_num_part_along_dim = this->num_first_level_parts;
2108 for (
int rd = 0; rd < this->recursion_depth; ++rd){
2110 mj_part_t maxNoPartAlongI = 0;
2111 mj_part_t nfutureNumParts = 0;
2116 this->first_level_distribution != NULL &&
2117 this->num_first_level_parts > 1) {
2119 maxNoPartAlongI = this->num_first_level_parts;
2120 this->max_num_part_along_dim = this->num_first_level_parts;
2122 mj_part_t sum_first_level_dist = 0;
2123 mj_part_t max_part = 0;
2126 for (
int i = 0; i < this->num_first_level_parts; ++i) {
2128 sum_first_level_dist += this->first_level_distribution[i];
2130 if (this->first_level_distribution[i] > max_part)
2131 max_part = this->first_level_distribution[i];
2135 nfutureNumParts = this->num_global_parts * max_part / sum_first_level_dist;
2140 maxNoPartAlongI = this->get_part_count(future_num_parts,
2141 1.0f / (this->recursion_depth - rd));
2143 if (maxNoPartAlongI > this->max_num_part_along_dim)
2144 this->max_num_part_along_dim = maxNoPartAlongI;
2147 nfutureNumParts = future_num_parts / maxNoPartAlongI;
2148 if (future_num_parts % maxNoPartAlongI){
2153 future_num_parts = nfutureNumParts;
2155 this->total_num_part = this->num_global_parts;
2157 if (this->divide_to_prime_first){
2158 this->total_dim_num_reduce_all = this->num_global_parts * 2;
2159 this->last_dim_num_part = this->num_global_parts;
2168 for (
int i = 0; i < this->recursion_depth; ++i){
2169 this->total_dim_num_reduce_all += p;
2170 p *= this->max_num_part_along_dim;
2173 if (p / this->max_num_part_along_dim > this->num_global_parts){
2174 this->last_dim_num_part = this->num_global_parts;
2177 this->last_dim_num_part = p / this->max_num_part_along_dim;
2183 this->total_num_cut = this->total_num_part - 1;
2184 this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
2185 this->max_num_total_part_along_dim = this->max_num_part_along_dim + size_t(this->max_num_cut_along_dim);
2189 if(this->max_concurrent_part_calculation > this->last_dim_num_part){
2190 if(this->mj_problemComm->getRank() == 0){
2191 std::cerr <<
"Warning: Concurrent part count ("<< this->max_concurrent_part_calculation <<
2192 ") has been set bigger than maximum amount that can be used." <<
2193 " Setting to:" << this->last_dim_num_part <<
"." << std::endl;
2195 this->max_concurrent_part_calculation = this->last_dim_num_part;
2204 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2206 inline mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::get_part_count(
2207 mj_part_t num_total_future,
2210 double fp = pow(num_total_future, root);
2211 mj_part_t ip = mj_part_t (fp);
2212 if (fp - ip < this->fEpsilon * 100){
2234 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2236 mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::update_part_num_arrays(
2237 std::vector <mj_part_t> &num_partitioning_in_current_dim,
2238 std::vector<mj_part_t> *future_num_part_in_parts,
2239 std::vector<mj_part_t> *next_future_num_parts_in_parts,
2240 mj_part_t &future_num_parts,
2241 mj_part_t current_num_parts,
2242 int current_iteration,
2243 RCP<mj_partBoxVector_t> input_part_boxes,
2244 RCP<mj_partBoxVector_t> output_part_boxes,
2245 mj_part_t atomic_part_count) {
2248 mj_part_t output_num_parts = 0;
2250 if(this->part_no_array){
2255 mj_part_t p = this->part_no_array[current_iteration];
2257 std::cout <<
"Current recursive iteration: " << current_iteration
2258 <<
" part_no_array[" << current_iteration <<
"] is given as:" << p << std::endl;
2262 return current_num_parts;
2265 if (this->first_level_distribution != NULL &&
2266 current_iteration == 0 &&
2267 p != this->num_first_level_parts)
2269 std::cout <<
"Current recursive iteration: " << current_iteration
2270 <<
" part_no_array[" << current_iteration <<
"] is given as: " << p
2271 <<
" and contradicts num_first_level_parts: " << this->num_first_level_parts << std::endl;
2275 for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
2276 num_partitioning_in_current_dim.push_back(p);
2292 future_num_parts /= num_partitioning_in_current_dim[0];
2293 output_num_parts = current_num_parts * num_partitioning_in_current_dim[0];
2295 if (this->mj_keep_part_boxes){
2296 for (mj_part_t k = 0; k < current_num_parts; ++k){
2298 for (mj_part_t j = 0; j < num_partitioning_in_current_dim[0]; ++j){
2299 output_part_boxes->push_back((*input_part_boxes)[k]);
2307 for (mj_part_t ii = 0; ii < output_num_parts; ++ii){
2308 next_future_num_parts_in_parts->push_back(future_num_parts);
2318 future_num_parts = 1;
2322 for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
2324 mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];
2328 mj_part_t num_partitions_in_current_dim =
2329 this->get_part_count(future_num_parts_of_part_ii,
2330 1.0 / (this->recursion_depth - current_iteration) );
2332 if (num_partitions_in_current_dim > this->max_num_part_along_dim){
2333 std::cerr <<
"ERROR: maxPartNo calculation is wrong. num_partitions_in_current_dim: "
2334 << num_partitions_in_current_dim <<
" this->max_num_part_along_dim: "
2335 << this->max_num_part_along_dim <<
2336 " this->recursion_depth: " << this->recursion_depth <<
2337 " current_iteration: " << current_iteration <<
2338 " future_num_parts_of_part_ii: " << future_num_parts_of_part_ii <<
2339 " might need to fix max part no calculation for largest_prime_first partitioning." <<
2352 if (current_iteration == 0 &&
2353 this->first_level_distribution != NULL &&
2354 this->num_first_level_parts > 1) {
2358 num_partitioning_in_current_dim.push_back(this->num_first_level_parts);
2361 output_num_parts = this->num_first_level_parts;
2364 future_num_parts /= this->num_first_level_parts;
2366 mj_part_t max_part = 0;
2367 mj_part_t sum_first_level_dist = 0;
2371 for (
int i = 0; i < this->num_first_level_parts; ++i) {
2372 sum_first_level_dist += this->first_level_distribution[i];
2374 if (this->first_level_distribution[i] > max_part)
2375 max_part = this->first_level_distribution[i];
2379 future_num_parts = this->num_global_parts * max_part / sum_first_level_dist;
2383 for (
int i = 0; i < this->num_first_level_parts; ++i) {
2385 next_future_num_parts_in_parts->push_back(this->first_level_distribution[i] *
2386 this->num_global_parts / sum_first_level_dist);
2389 else if (this->divide_to_prime_first) {
2392 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2394 mj_part_t largest_prime_factor = num_partitions_in_current_dim;
2397 output_num_parts += num_partitions_in_current_dim;
2399 if (future_num_parts_of_part_ii == atomic_part_count ||
2400 future_num_parts_of_part_ii % atomic_part_count != 0) {
2401 atomic_part_count = 1;
2404 largest_prime_factor =
2405 this->find_largest_prime_factor(future_num_parts_of_part_ii / atomic_part_count);
2412 if (largest_prime_factor < num_partitions_in_current_dim) {
2413 largest_prime_factor = num_partitions_in_current_dim;
2417 mj_part_t ideal_num_future_parts_in_part =
2418 (future_num_parts_of_part_ii / atomic_part_count) / largest_prime_factor;
2420 mj_part_t ideal_prime_scale = largest_prime_factor / num_partitions_in_current_dim;
2428 for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2430 mj_part_t my_ideal_primescale = ideal_prime_scale;
2432 if (iii < (largest_prime_factor) % num_partitions_in_current_dim) {
2433 ++my_ideal_primescale;
2436 mj_part_t num_future_parts_for_part_iii =
2437 ideal_num_future_parts_in_part * my_ideal_primescale;
2440 if (iii < (future_num_parts_of_part_ii / atomic_part_count) % largest_prime_factor) {
2442 ++num_future_parts_for_part_iii;
2445 next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);
2448 if (this->mj_keep_part_boxes) {
2449 output_part_boxes->push_back((*input_part_boxes)[ii]);
2453 if (num_future_parts_for_part_iii > future_num_parts)
2454 future_num_parts = num_future_parts_for_part_iii;
2461 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2464 output_num_parts += num_partitions_in_current_dim;
2466 if (future_num_parts_of_part_ii == atomic_part_count ||
2467 future_num_parts_of_part_ii % atomic_part_count != 0) {
2468 atomic_part_count = 1;
2471 mj_part_t ideal_num_future_parts_in_part =
2472 (future_num_parts_of_part_ii / atomic_part_count) / num_partitions_in_current_dim;
2474 for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii){
2475 mj_part_t num_future_parts_for_part_iii = ideal_num_future_parts_in_part;
2478 if (iii < (future_num_parts_of_part_ii / atomic_part_count) % num_partitions_in_current_dim){
2480 ++num_future_parts_for_part_iii;
2483 next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);
2486 if (this->mj_keep_part_boxes){
2487 output_part_boxes->push_back((*input_part_boxes)[ii]);
2491 if (num_future_parts_for_part_iii > future_num_parts)
2492 future_num_parts = num_future_parts_for_part_iii;
2497 return output_num_parts;
2504 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2506 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::allocate_set_work_memory(){
2509 this->owner_of_coordinate = NULL;
2514 this->coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);
2516 #ifdef HAVE_ZOLTAN2_OMP
2517 #pragma omp parallel for
2519 for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
2520 this->coordinate_permutations[i] = i;
2524 this->new_coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);
2526 this->assigned_part_ids = NULL;
2527 if(this->num_local_coords > 0){
2528 this->assigned_part_ids = allocMemory<mj_part_t>(this->num_local_coords);
2534 this->part_xadj = allocMemory<mj_lno_t>(1);
2535 this->part_xadj[0] =
static_cast<mj_lno_t
>(this->num_local_coords);
2537 this->new_part_xadj = NULL;
2543 this->all_cut_coordinates = allocMemory< mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2545 this->max_min_coords = allocMemory< mj_scalar_t>(this->num_threads * 2);
2547 this->process_cut_line_weight_to_put_left = NULL;
2548 this->thread_cut_line_weight_to_put_left = NULL;
2550 if(this->distribute_points_on_cut_lines){
2551 this->process_cut_line_weight_to_put_left = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2552 this->thread_cut_line_weight_to_put_left = allocMemory<mj_scalar_t *>(this->num_threads);
2553 for(
int i = 0; i < this->num_threads; ++i){
2554 this->thread_cut_line_weight_to_put_left[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2556 this->process_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2557 this->global_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2565 this->cut_coordinates_work_array = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim *
2566 this->max_concurrent_part_calculation);
2570 this->target_part_weights = allocMemory<mj_scalar_t>(
2571 this->max_num_part_along_dim * this->max_concurrent_part_calculation);
2574 this->cut_upper_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2575 this->cut_lower_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2576 this->cut_lower_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2577 this->cut_upper_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2579 this->process_local_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation);
2580 this->global_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation);
2584 this->is_cut_line_determined = allocMemory<bool>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2587 this->my_incomplete_cut_count = allocMemory<mj_part_t>(this->max_concurrent_part_calculation);
2589 this->thread_part_weights = allocMemory<double *>(this->num_threads);
2591 this->thread_part_weight_work = allocMemory<double *>(this->num_threads);
2594 this->thread_cut_left_closest_point = allocMemory<mj_scalar_t *>(this->num_threads);
2596 this->thread_cut_right_closest_point = allocMemory<mj_scalar_t *>(this->num_threads);
2599 this->thread_point_counts = allocMemory<mj_lno_t *>(this->num_threads);
2601 for(
int i = 0; i < this->num_threads; ++i){
2603 this->thread_part_weights[i] = allocMemory < double >(this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
2604 this->thread_cut_right_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2605 this->thread_cut_left_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2606 this->thread_point_counts[i] = allocMemory<mj_lno_t>(this->max_num_part_along_dim);
2612 this->total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2613 this->global_total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2616 mj_scalar_t **coord = allocMemory<mj_scalar_t *>(this->coord_dim);
2617 for (
int i=0; i < this->coord_dim; i++){
2618 coord[i] = allocMemory<mj_scalar_t>(this->num_local_coords);
2619 #ifdef HAVE_ZOLTAN2_OMP
2620 #pragma omp parallel for
2622 for (mj_lno_t j=0; j < this->num_local_coords; j++)
2623 coord[i][j] = this->mj_coordinates[i][j];
2625 this->mj_coordinates = coord;
2628 int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
2629 mj_scalar_t **
weights = allocMemory<mj_scalar_t *>(criteria_dim);
2631 for (
int i=0; i < criteria_dim; i++){
2634 for (
int i=0; i < this->num_weights_per_coord; i++){
2635 weights[i] = allocMemory<mj_scalar_t>(this->num_local_coords);
2636 #ifdef HAVE_ZOLTAN2_OMP
2637 #pragma omp parallel for
2639 for (mj_lno_t j=0; j < this->num_local_coords; j++)
2640 weights[i][j] = this->mj_weights[i][j];
2644 this->current_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);
2645 #ifdef HAVE_ZOLTAN2_OMP
2646 #pragma omp parallel for
2648 for (mj_lno_t j=0; j < this->num_local_coords; j++)
2649 this->current_mj_gnos[j] = this->initial_mj_gnos[j];
2651 this->owner_of_coordinate = allocMemory<int>(this->num_local_coords);
2653 #ifdef HAVE_ZOLTAN2_OMP
2654 #pragma omp parallel for
2656 for (mj_lno_t j=0; j < this->num_local_coords; j++)
2657 this->owner_of_coordinate[j] = this->myActualRank;
2662 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2664 void AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::compute_global_box()
2667 mj_scalar_t *mins = allocMemory<mj_scalar_t>(this->coord_dim);
2669 mj_scalar_t *gmins = allocMemory<mj_scalar_t>(this->coord_dim);
2671 mj_scalar_t *maxs = allocMemory<mj_scalar_t>(this->coord_dim);
2673 mj_scalar_t *gmaxs = allocMemory<mj_scalar_t>(this->coord_dim);
2675 for (
int i = 0; i < this->coord_dim; ++i){
2676 mj_scalar_t localMin = std::numeric_limits<mj_scalar_t>::max();
2677 mj_scalar_t localMax = -localMin;
2678 if (localMax > 0) localMax = 0;
2681 for (mj_lno_t j = 0; j < this->num_local_coords; ++j){
2682 if (this->mj_coordinates[i][j] < localMin){
2683 localMin = this->mj_coordinates[i][j];
2685 if (this->mj_coordinates[i][j] > localMax){
2686 localMax = this->mj_coordinates[i][j];
2695 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
2696 this->coord_dim, mins, gmins
2700 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
2701 this->coord_dim, maxs, gmaxs
2707 global_box = rcp(
new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
2709 freeArray<mj_scalar_t>(mins);
2710 freeArray<mj_scalar_t>(gmins);
2711 freeArray<mj_scalar_t>(maxs);
2712 freeArray<mj_scalar_t>(gmaxs);
2720 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2722 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::init_part_boxes(
2723 RCP<mj_partBoxVector_t> & initial_partitioning_boxes
2726 mj_partBox_t tmp_box(*global_box);
2727 initial_partitioning_boxes->push_back(tmp_box);
2740 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2742 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_local_min_max_coord_totW(
2743 mj_lno_t coordinate_begin_index,
2744 mj_lno_t coordinate_end_index,
2745 mj_lno_t *mj_current_coordinate_permutations,
2746 mj_scalar_t *mj_current_dim_coords,
2747 mj_scalar_t &min_coordinate,
2748 mj_scalar_t &max_coordinate,
2749 mj_scalar_t &total_weight){
2753 if(coordinate_begin_index >= coordinate_end_index)
2755 min_coordinate = this->maxScalar_t;
2756 max_coordinate = this->minScalar_t;
2760 mj_scalar_t my_total_weight = 0;
2761 #ifdef HAVE_ZOLTAN2_OMP
2762 #pragma omp parallel num_threads(this->num_threads)
2766 if (this->mj_uniform_weights[0]) {
2767 #ifdef HAVE_ZOLTAN2_OMP
2771 my_total_weight = coordinate_end_index - coordinate_begin_index;
2777 #ifdef HAVE_ZOLTAN2_OMP
2778 #pragma omp for reduction(+:my_total_weight)
2780 for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2781 int i = mj_current_coordinate_permutations[ii];
2782 my_total_weight += this->mj_weights[0][i];
2786 int my_thread_id = 0;
2787 #ifdef HAVE_ZOLTAN2_OMP
2788 my_thread_id = omp_get_thread_num();
2790 mj_scalar_t my_thread_min_coord, my_thread_max_coord;
2791 my_thread_min_coord=my_thread_max_coord
2792 =mj_current_dim_coords[mj_current_coordinate_permutations[coordinate_begin_index]];
2795 #ifdef HAVE_ZOLTAN2_OMP
2798 for(mj_lno_t j = coordinate_begin_index + 1; j < coordinate_end_index; ++j){
2799 int i = mj_current_coordinate_permutations[j];
2800 if(mj_current_dim_coords[i] > my_thread_max_coord)
2801 my_thread_max_coord = mj_current_dim_coords[i];
2802 if(mj_current_dim_coords[i] < my_thread_min_coord)
2803 my_thread_min_coord = mj_current_dim_coords[i];
2805 this->max_min_coords[my_thread_id] = my_thread_min_coord;
2806 this->max_min_coords[my_thread_id + this->num_threads] = my_thread_max_coord;
2808 #ifdef HAVE_ZOLTAN2_OMP
2811 #pragma omp single nowait
2814 min_coordinate = this->max_min_coords[0];
2815 for(
int i = 1; i < this->num_threads; ++i){
2816 if(this->max_min_coords[i] < min_coordinate)
2817 min_coordinate = this->max_min_coords[i];
2821 #ifdef HAVE_ZOLTAN2_OMP
2822 #pragma omp single nowait
2825 max_coordinate = this->max_min_coords[this->num_threads];
2826 for(
int i = this->num_threads + 1; i < this->num_threads * 2; ++i){
2827 if(this->max_min_coords[i] > max_coordinate)
2828 max_coordinate = this->max_min_coords[i];
2832 total_weight = my_total_weight;
2844 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2846 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_global_min_max_coord_totW(
2847 mj_part_t current_concurrent_num_parts,
2848 mj_scalar_t *local_min_max_total,
2849 mj_scalar_t *global_min_max_total){
2854 if(this->comm->getSize() > 1){
2857 current_concurrent_num_parts,
2858 current_concurrent_num_parts,
2859 current_concurrent_num_parts);
2861 reduceAll<int, mj_scalar_t>(
2864 3 * current_concurrent_num_parts,
2865 local_min_max_total,
2866 global_min_max_total);
2871 mj_part_t s = 3 * current_concurrent_num_parts;
2872 for (mj_part_t i = 0; i < s; ++i){
2873 global_min_max_total[i] = local_min_max_total[i];
2907 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2909 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_initial_cut_coords_target_weights(
2910 mj_scalar_t min_coord,
2911 mj_scalar_t max_coord,
2912 mj_part_t num_cuts ,
2913 mj_scalar_t global_weight,
2914 mj_scalar_t *initial_cut_coords ,
2915 mj_scalar_t *current_target_part_weights ,
2917 std::vector <mj_part_t> *future_num_part_in_parts,
2918 std::vector <mj_part_t> *next_future_num_parts_in_parts,
2919 mj_part_t concurrent_current_part,
2920 mj_part_t obtained_part_index,
2921 mj_part_t num_target_first_level_parts,
2922 const mj_part_t *target_first_level_dist) {
2924 mj_scalar_t coord_range = max_coord - min_coord;
2927 if (num_target_first_level_parts <= 1 &&
2928 this->mj_uniform_parts[0]) {
2930 mj_part_t cumulative = 0;
2933 mj_scalar_t total_future_part_count_in_part = mj_scalar_t((*future_num_part_in_parts)[concurrent_current_part]);
2936 mj_scalar_t unit_part_weight = global_weight / total_future_part_count_in_part;
2938 for (mj_part_t i = 0; i < num_cuts; ++i) {
2939 cumulative += (*next_future_num_parts_in_parts)[i + obtained_part_index];
2942 current_target_part_weights[i] = cumulative * unit_part_weight;
2945 initial_cut_coords[i] = min_coord + (coord_range * cumulative) / total_future_part_count_in_part;
2948 current_target_part_weights[num_cuts] = global_weight;
2952 if (this->mj_uniform_weights[0]) {
2953 for (mj_part_t i = 0; i < num_cuts + 1; ++i) {
2954 current_target_part_weights[i] = long(current_target_part_weights[i] + 0.5);
2959 else if(num_target_first_level_parts > 1 &&
2960 target_first_level_dist != NULL) {
2963 mj_part_t cumulative = 0.0;
2966 mj_scalar_t sum_target_first_level_dist = 0.0;
2968 for (
int i = 0; i < num_target_first_level_parts; ++i) {
2969 sum_target_first_level_dist += target_first_level_dist[i];
2972 for (mj_part_t i = 0; i < num_cuts; ++i) {
2973 cumulative += global_weight * target_first_level_dist[i] / sum_target_first_level_dist;
2976 current_target_part_weights[i] = cumulative;
2979 initial_cut_coords[i] = min_coord + (coord_range * cumulative) / global_weight;
2982 current_target_part_weights[num_cuts] = global_weight;
2986 for (mj_part_t i = 0; i < num_cuts + 1; ++i) {
2987 current_target_part_weights[i] = long(current_target_part_weights[i] + 0.5);
2991 std::cerr <<
"MJ does not support non uniform part weights beyond the first partition" << std::endl;
3009 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3011 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_initial_coordinate_parts(
3012 mj_scalar_t &max_coordinate,
3013 mj_scalar_t &min_coordinate,
3015 mj_lno_t coordinate_begin_index,
3016 mj_lno_t coordinate_end_index,
3017 mj_lno_t *mj_current_coordinate_permutations,
3018 mj_scalar_t *mj_current_dim_coords,
3019 mj_part_t *mj_part_ids,
3020 mj_part_t &partition_count
3022 mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
3026 if(
ZOLTAN2_ABS(coordinate_range) < this->sEpsilon ){
3027 #ifdef HAVE_ZOLTAN2_OMP
3028 #pragma omp parallel for
3030 for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
3031 mj_part_ids[mj_current_coordinate_permutations[ii]] = 0;
3038 mj_scalar_t slice = coordinate_range / partition_count;
3040 #ifdef HAVE_ZOLTAN2_OMP
3041 #pragma omp parallel for
3043 for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
3045 mj_lno_t iii = mj_current_coordinate_permutations[ii];
3046 mj_part_t pp = mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
3047 mj_part_ids[iii] = 2 * pp;
3063 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3065 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_1D_part(
3066 mj_scalar_t *mj_current_dim_coords,
3067 double used_imbalance_tolerance,
3068 mj_part_t current_work_part,
3069 mj_part_t current_concurrent_num_parts,
3070 mj_scalar_t *current_cut_coordinates,
3071 mj_part_t total_incomplete_cut_count,
3072 std::vector <mj_part_t> &num_partitioning_in_current_dim
3076 mj_part_t rectilinear_cut_count = 0;
3077 mj_scalar_t *temp_cut_coords = current_cut_coordinates;
3080 *reductionOp = NULL;
3082 <mj_part_t, mj_scalar_t>(
3083 &num_partitioning_in_current_dim ,
3085 current_concurrent_num_parts);
3087 size_t total_reduction_size = 0;
3088 #ifdef HAVE_ZOLTAN2_OMP
3089 #pragma omp parallel shared(total_incomplete_cut_count, rectilinear_cut_count) num_threads(this->num_threads)
3093 #ifdef HAVE_ZOLTAN2_OMP
3094 me = omp_get_thread_num();
3096 double *my_thread_part_weights = this->thread_part_weights[me];
3097 mj_scalar_t *my_thread_left_closest = this->thread_cut_left_closest_point[me];
3098 mj_scalar_t *my_thread_right_closest = this->thread_cut_right_closest_point[me];
3100 #ifdef HAVE_ZOLTAN2_OMP
3106 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3108 mj_part_t num_part_in_dim = num_partitioning_in_current_dim[current_work_part + i];
3109 mj_part_t num_cut_in_dim = num_part_in_dim - 1;
3110 total_reduction_size += (4 * num_cut_in_dim + 1);
3112 for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii){
3113 this->is_cut_line_determined[next] =
false;
3114 this->cut_lower_bound_coordinates[next] = global_min_max_coord_total_weight[i];
3115 this->cut_upper_bound_coordinates[next] = global_min_max_coord_total_weight[i + current_concurrent_num_parts];
3117 this->cut_upper_bound_weights[next] = global_min_max_coord_total_weight[i + 2 * current_concurrent_num_parts];
3118 this->cut_lower_bound_weights[next] = 0;
3120 if(this->distribute_points_on_cut_lines){
3121 this->process_cut_line_weight_to_put_left[next] = 0;
3132 while (total_incomplete_cut_count != 0){
3134 mj_part_t concurrent_cut_shifts = 0;
3135 size_t total_part_shift = 0;
3137 for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
3138 mj_part_t num_parts = -1;
3139 num_parts = num_partitioning_in_current_dim[current_work_part + kk];
3141 mj_part_t num_cuts = num_parts - 1;
3142 size_t total_part_count = num_parts + size_t (num_cuts) ;
3143 if (this->my_incomplete_cut_count[kk] > 0){
3146 bool *current_cut_status = this->is_cut_line_determined + concurrent_cut_shifts;
3147 double *my_current_part_weights = my_thread_part_weights + total_part_shift;
3148 mj_scalar_t *my_current_left_closest = my_thread_left_closest + concurrent_cut_shifts;
3149 mj_scalar_t *my_current_right_closest = my_thread_right_closest + concurrent_cut_shifts;
3151 mj_part_t conccurent_current_part = current_work_part + kk;
3152 mj_lno_t coordinate_begin_index = conccurent_current_part ==0 ? 0: this->part_xadj[conccurent_current_part -1];
3153 mj_lno_t coordinate_end_index = this->part_xadj[conccurent_current_part];
3154 mj_scalar_t *temp_current_cut_coords = temp_cut_coords + concurrent_cut_shifts;
3156 mj_scalar_t min_coord = global_min_max_coord_total_weight[kk];
3157 mj_scalar_t max_coord = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
3160 this->mj_1D_part_get_thread_part_weights(
3165 coordinate_begin_index,
3166 coordinate_end_index,
3167 mj_current_dim_coords,
3168 temp_current_cut_coords,
3170 my_current_part_weights,
3171 my_current_left_closest,
3172 my_current_right_closest);
3176 concurrent_cut_shifts += num_cuts;
3177 total_part_shift += total_part_count;
3181 this->mj_accumulate_thread_results(
3182 num_partitioning_in_current_dim,
3184 current_concurrent_num_parts);
3187 #ifdef HAVE_ZOLTAN2_OMP
3191 if(this->comm->getSize() > 1){
3192 reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
3193 total_reduction_size,
3194 this->total_part_weight_left_right_closests,
3195 this->global_total_part_weight_left_right_closests);
3200 this->global_total_part_weight_left_right_closests,
3201 this->total_part_weight_left_right_closests,
3202 total_reduction_size *
sizeof(mj_scalar_t));
3207 mj_part_t cut_shift = 0;
3210 size_t tlr_shift = 0;
3211 for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
3212 mj_part_t num_parts = num_partitioning_in_current_dim[current_work_part + kk];
3213 mj_part_t num_cuts = num_parts - 1;
3214 size_t num_total_part = num_parts + size_t (num_cuts) ;
3219 if (this->my_incomplete_cut_count[kk] == 0) {
3220 cut_shift += num_cuts;
3221 tlr_shift += (num_total_part + 2 * num_cuts);
3225 mj_scalar_t *current_local_part_weights = this->total_part_weight_left_right_closests + tlr_shift ;
3226 mj_scalar_t *current_global_tlr = this->global_total_part_weight_left_right_closests + tlr_shift;
3227 mj_scalar_t *current_global_left_closest_points = current_global_tlr + num_total_part;
3228 mj_scalar_t *current_global_right_closest_points = current_global_tlr + num_total_part + num_cuts;
3229 mj_scalar_t *current_global_part_weights = current_global_tlr;
3230 bool *current_cut_line_determined = this->is_cut_line_determined + cut_shift;
3232 mj_scalar_t *current_part_target_weights = this->target_part_weights + cut_shift + kk;
3233 mj_scalar_t *current_part_cut_line_weight_to_put_left = this->process_cut_line_weight_to_put_left + cut_shift;
3235 mj_scalar_t min_coordinate = global_min_max_coord_total_weight[kk];
3236 mj_scalar_t max_coordinate = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
3237 mj_scalar_t global_total_weight = global_min_max_coord_total_weight[kk + current_concurrent_num_parts * 2];
3238 mj_scalar_t *current_cut_lower_bound_weights = this->cut_lower_bound_weights + cut_shift;
3239 mj_scalar_t *current_cut_upper_weights = this->cut_upper_bound_weights + cut_shift;
3240 mj_scalar_t *current_cut_upper_bounds = this->cut_upper_bound_coordinates + cut_shift;
3241 mj_scalar_t *current_cut_lower_bounds = this->cut_lower_bound_coordinates + cut_shift;
3243 mj_part_t initial_incomplete_cut_count = this->my_incomplete_cut_count[kk];
3246 this->mj_get_new_cut_coordinates(
3251 global_total_weight,
3252 used_imbalance_tolerance,
3253 current_global_part_weights,
3254 current_local_part_weights,
3255 current_part_target_weights,
3256 current_cut_line_determined,
3257 temp_cut_coords + cut_shift,
3258 current_cut_upper_bounds,
3259 current_cut_lower_bounds,
3260 current_global_left_closest_points,
3261 current_global_right_closest_points,
3262 current_cut_lower_bound_weights,
3263 current_cut_upper_weights,
3264 this->cut_coordinates_work_array +cut_shift,
3265 current_part_cut_line_weight_to_put_left,
3266 &rectilinear_cut_count,
3267 this->my_incomplete_cut_count[kk]);
3269 cut_shift += num_cuts;
3270 tlr_shift += (num_total_part + 2 * num_cuts);
3271 mj_part_t iteration_complete_cut_count = initial_incomplete_cut_count - this->my_incomplete_cut_count[kk];
3272 #ifdef HAVE_ZOLTAN2_OMP
3276 total_incomplete_cut_count -= iteration_complete_cut_count;
3281 #ifdef HAVE_ZOLTAN2_OMP
3287 mj_scalar_t *t = temp_cut_coords;
3288 temp_cut_coords = this->cut_coordinates_work_array;
3289 this->cut_coordinates_work_array = t;
3300 if (current_cut_coordinates != temp_cut_coords){
3301 #ifdef HAVE_ZOLTAN2_OMP
3306 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3307 mj_part_t num_parts = -1;
3308 num_parts = num_partitioning_in_current_dim[current_work_part + i];
3309 mj_part_t num_cuts = num_parts - 1;
3311 for(mj_part_t ii = 0; ii < num_cuts; ++ii){
3312 current_cut_coordinates[next + ii] = temp_cut_coords[next + ii];
3318 #ifdef HAVE_ZOLTAN2_OMP
3322 this->cut_coordinates_work_array = temp_cut_coords;
3349 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3351 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_1D_part_get_thread_part_weights(
3352 size_t total_part_count,
3354 mj_scalar_t max_coord,
3355 mj_scalar_t min_coord,
3356 mj_lno_t coordinate_begin_index,
3357 mj_lno_t coordinate_end_index,
3358 mj_scalar_t *mj_current_dim_coords,
3359 mj_scalar_t *temp_current_cut_coords,
3361 double *my_current_part_weights,
3362 mj_scalar_t *my_current_left_closest,
3363 mj_scalar_t *my_current_right_closest){
3366 for (
size_t i = 0; i < total_part_count; ++i){
3367 my_current_part_weights[i] = 0;
3372 for(mj_part_t i = 0; i < num_cuts; ++i){
3373 my_current_left_closest[i] = min_coord - 1;
3374 my_current_right_closest[i] = max_coord + 1;
3377 mj_scalar_t minus_EPSILON = -this->sEpsilon;
3378 #ifdef HAVE_ZOLTAN2_OMP
3384 for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
3385 int i = this->coordinate_permutations[ii];
3389 mj_part_t j = this->assigned_part_ids[i] / 2;
3395 mj_part_t lower_cut_index = 0;
3396 mj_part_t upper_cut_index = num_cuts - 1;
3398 mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
3399 bool is_inserted =
false;
3400 bool is_on_left_of_cut =
false;
3401 bool is_on_right_of_cut =
false;
3402 mj_part_t last_compared_part = -1;
3404 mj_scalar_t coord = mj_current_dim_coords[i];
3406 while(upper_cut_index >= lower_cut_index)
3409 last_compared_part = -1;
3410 is_on_left_of_cut =
false;
3411 is_on_right_of_cut =
false;
3412 mj_scalar_t cut = temp_current_cut_coords[j];
3413 mj_scalar_t distance_to_cut = coord - cut;
3414 mj_scalar_t abs_distance_to_cut =
ZOLTAN2_ABS(distance_to_cut);
3417 if(abs_distance_to_cut < this->sEpsilon){
3419 my_current_part_weights[j * 2 + 1] += w;
3420 this->assigned_part_ids[i] = j * 2 + 1;
3423 my_current_left_closest[j] = coord;
3424 my_current_right_closest[j] = coord;
3427 mj_part_t kk = j + 1;
3428 while(kk < num_cuts){
3430 distance_to_cut =
ZOLTAN2_ABS(temp_current_cut_coords[kk] - cut);
3431 if(distance_to_cut < this->sEpsilon){
3432 my_current_part_weights[2 * kk + 1] += w;
3433 my_current_left_closest[kk] = coord;
3434 my_current_right_closest[kk] = coord;
3440 if(coord - my_current_left_closest[kk] > this->sEpsilon){
3441 my_current_left_closest[kk] = coord;
3451 distance_to_cut =
ZOLTAN2_ABS(temp_current_cut_coords[kk] - cut);
3452 if(distance_to_cut < this->sEpsilon){
3453 my_current_part_weights[2 * kk + 1] += w;
3455 this->assigned_part_ids[i] = kk * 2 + 1;
3456 my_current_left_closest[kk] = coord;
3457 my_current_right_closest[kk] = coord;
3463 if(my_current_right_closest[kk] - coord > this->sEpsilon){
3464 my_current_right_closest[kk] = coord;
3475 if (distance_to_cut < 0) {
3476 bool _break =
false;
3480 mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j - 1];
3481 if(distance_to_next_cut > this->sEpsilon){
3487 upper_cut_index = j - 1;
3489 is_on_left_of_cut =
true;
3490 last_compared_part = j;
3495 bool _break =
false;
3496 if(j < num_cuts - 1){
3499 mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j + 1];
3500 if(distance_to_next_cut < minus_EPSILON){
3507 lower_cut_index = j + 1;
3509 is_on_right_of_cut =
true;
3510 last_compared_part = j;
3515 j = (upper_cut_index + lower_cut_index) / 2;
3518 if(is_on_right_of_cut){
3521 my_current_part_weights[2 * last_compared_part + 2] += w;
3522 this->assigned_part_ids[i] = 2 * last_compared_part + 2;
3525 if(my_current_right_closest[last_compared_part] - coord > this->sEpsilon){
3526 my_current_right_closest[last_compared_part] = coord;
3529 if(last_compared_part+1 < num_cuts){
3531 if(coord - my_current_left_closest[last_compared_part + 1] > this->sEpsilon){
3532 my_current_left_closest[last_compared_part + 1] = coord;
3537 else if(is_on_left_of_cut){
3540 my_current_part_weights[2 * last_compared_part] += w;
3541 this->assigned_part_ids[i] = 2 * last_compared_part;
3545 if(coord - my_current_left_closest[last_compared_part] > this->sEpsilon){
3546 my_current_left_closest[last_compared_part] = coord;
3550 if(last_compared_part-1 >= 0){
3551 if(my_current_right_closest[last_compared_part -1] - coord > this->sEpsilon){
3552 my_current_right_closest[last_compared_part -1] = coord;
3561 for (
size_t i = 1; i < total_part_count; ++i){
3565 if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
3566 ZOLTAN2_ABS(temp_current_cut_coords[i / 2] - temp_current_cut_coords[i /2 - 1])
3571 my_current_part_weights[i] = my_current_part_weights[i-2];
3575 my_current_part_weights[i] += my_current_part_weights[i-1];
3587 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3589 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_accumulate_thread_results(
3590 const std::vector <mj_part_t> &num_partitioning_in_current_dim,
3591 mj_part_t current_work_part,
3592 mj_part_t current_concurrent_num_parts){
3594 #ifdef HAVE_ZOLTAN2_OMP
3601 size_t tlr_array_shift = 0;
3602 mj_part_t cut_shift = 0;
3605 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3607 mj_part_t num_parts_in_part = num_partitioning_in_current_dim[current_work_part + i];
3608 mj_part_t num_cuts_in_part = num_parts_in_part - 1;
3609 size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;
3612 for(mj_part_t ii = 0; ii < num_cuts_in_part ; ++ii){
3613 mj_part_t next = tlr_array_shift + ii;
3614 mj_part_t cut_index = cut_shift + ii;
3615 if(this->is_cut_line_determined[cut_index])
continue;
3616 mj_scalar_t left_closest_in_process = this->thread_cut_left_closest_point[0][cut_index],
3617 right_closest_in_process = this->thread_cut_right_closest_point[0][cut_index];
3620 for (
int j = 1; j < this->num_threads; ++j){
3621 if (this->thread_cut_right_closest_point[j][cut_index] < right_closest_in_process ){
3622 right_closest_in_process = this->thread_cut_right_closest_point[j][cut_index];
3624 if (this->thread_cut_left_closest_point[j][cut_index] > left_closest_in_process ){
3625 left_closest_in_process = this->thread_cut_left_closest_point[j][cut_index];
3629 this->total_part_weight_left_right_closests[num_total_part_in_part +
3630 next] = left_closest_in_process;
3631 this->total_part_weight_left_right_closests[num_total_part_in_part +
3632 num_cuts_in_part + next] = right_closest_in_process;
3635 tlr_array_shift += (num_total_part_in_part + 2 * num_cuts_in_part);
3636 cut_shift += num_cuts_in_part;
3639 tlr_array_shift = 0;
3641 size_t total_part_array_shift = 0;
3644 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3646 mj_part_t num_parts_in_part = num_partitioning_in_current_dim[current_work_part + i];
3647 mj_part_t num_cuts_in_part = num_parts_in_part - 1;
3648 size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;
3650 for(
size_t j = 0; j < num_total_part_in_part; ++j){
3652 mj_part_t cut_ind = j / 2 + cut_shift;
3657 if(j != num_total_part_in_part - 1 && this->is_cut_line_determined[cut_ind])
continue;
3659 for (
int k = 0; k < this->num_threads; ++k){
3660 pwj += this->thread_part_weights[k][total_part_array_shift + j];
3663 this->total_part_weight_left_right_closests[tlr_array_shift + j] = pwj;
3665 cut_shift += num_cuts_in_part;
3666 tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
3667 total_part_array_shift += num_total_part_in_part;
3685 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3687 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_calculate_new_cut_position (
3688 mj_scalar_t cut_upper_bound,
3689 mj_scalar_t cut_lower_bound,
3690 mj_scalar_t cut_upper_weight,
3691 mj_scalar_t cut_lower_weight,
3692 mj_scalar_t expected_weight,
3693 mj_scalar_t &new_cut_position){
3695 if(
ZOLTAN2_ABS(cut_upper_bound - cut_lower_bound) < this->sEpsilon){
3696 new_cut_position = cut_upper_bound;
3700 if(
ZOLTAN2_ABS(cut_upper_weight - cut_lower_weight) < this->sEpsilon){
3701 new_cut_position = cut_lower_bound;
3704 mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
3705 mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
3706 mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
3708 mj_scalar_t required_shift = (my_weight_diff / weight_range);
3709 int scale_constant = 20;
3710 int shiftint= int (required_shift * scale_constant);
3711 if (shiftint == 0) shiftint = 1;
3712 required_shift = mj_scalar_t (shiftint) / scale_constant;
3713 new_cut_position = coordinate_range * required_shift + cut_lower_bound;
3727 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3729 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_create_new_partitions(
3730 mj_part_t num_parts,
3732 mj_scalar_t *current_concurrent_cut_coordinate,
3733 mj_lno_t coordinate_begin,
3734 mj_lno_t coordinate_end,
3735 mj_scalar_t *used_local_cut_line_weight_to_left,
3736 double **used_thread_part_weight_work,
3737 mj_lno_t *out_part_xadj){
3739 mj_part_t num_cuts = num_parts - 1;
3741 #ifdef HAVE_ZOLTAN2_OMP
3742 #pragma omp parallel
3746 #ifdef HAVE_ZOLTAN2_OMP
3747 me = omp_get_thread_num();
3750 mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
3751 mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;
3755 if (this->distribute_points_on_cut_lines){
3756 my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
3758 #ifdef HAVE_ZOLTAN2_OMP
3761 for (mj_part_t i = 0; i < num_cuts; ++i){
3763 mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
3764 for(
int ii = 0; ii < this->num_threads; ++ii){
3765 if(left_weight > this->sEpsilon){
3767 mj_scalar_t thread_ii_weight_on_cut = used_thread_part_weight_work[ii][i * 2 + 1] - used_thread_part_weight_work[ii][i * 2 ];
3768 if(thread_ii_weight_on_cut < left_weight){
3770 this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
3774 this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
3776 left_weight -= thread_ii_weight_on_cut;
3779 this->thread_cut_line_weight_to_put_left[ii][i] = 0;
3787 for (mj_part_t i = num_cuts - 1; i > 0 ; --i){
3788 if(
ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
3789 my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
3797 for(mj_part_t ii = 0; ii < num_parts; ++ii){
3798 thread_num_points_in_parts[ii] = 0;
3802 #ifdef HAVE_ZOLTAN2_OMP
3806 for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
3808 mj_lno_t coordinate_index = this->coordinate_permutations[ii];
3809 mj_scalar_t coordinate_weight = this->mj_uniform_weights[0]? 1:this->mj_weights[0][coordinate_index];
3810 mj_part_t coordinate_assigned_place = this->assigned_part_ids[coordinate_index];
3811 mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
3812 if(coordinate_assigned_place % 2 == 1){
3814 if(this->distribute_points_on_cut_lines
3815 && my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] > this->sEpsilon){
3819 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
3824 if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0
3825 && coordinate_assigned_part < num_cuts - 1
3826 &&
ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
3827 current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
3828 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
3830 ++thread_num_points_in_parts[coordinate_assigned_part];
3831 this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3835 ++coordinate_assigned_part;
3837 while(this->distribute_points_on_cut_lines &&
3838 coordinate_assigned_part < num_cuts){
3840 if(
ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part] -
3841 current_concurrent_cut_coordinate[coordinate_assigned_part - 1])
3844 if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >
3846 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >=
3847 ZOLTAN2_ABS(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] - coordinate_weight)){
3848 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
3851 if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0 &&
3852 coordinate_assigned_part < num_cuts - 1 &&
3853 ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
3854 current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
3855 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
3863 ++coordinate_assigned_part;
3865 ++thread_num_points_in_parts[coordinate_assigned_part];
3866 this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3871 ++thread_num_points_in_parts[coordinate_assigned_part];
3872 this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3881 #ifdef HAVE_ZOLTAN2_OMP
3884 for(mj_part_t j = 0; j < num_parts; ++j){
3885 mj_lno_t num_points_in_part_j_upto_thread_i = 0;
3886 for (
int i = 0; i < this->num_threads; ++i){
3887 mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
3889 this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
3890 num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;
3893 out_part_xadj[j] = num_points_in_part_j_upto_thread_i;
3897 #ifdef HAVE_ZOLTAN2_OMP
3902 for(mj_part_t j = 1; j < num_parts; ++j){
3903 out_part_xadj[j] += out_part_xadj[j - 1];
3909 for(mj_part_t j = 1; j < num_parts; ++j){
3910 thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
3916 #ifdef HAVE_ZOLTAN2_OMP
3919 for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
3920 mj_lno_t i = this->coordinate_permutations[ii];
3921 mj_part_t p = this->assigned_part_ids[i];
3922 this->new_coordinate_permutations[coordinate_begin +
3923 thread_num_points_in_parts[p]++] = i;
3958 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3960 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_new_cut_coordinates(
3962 const mj_part_t &num_cuts,
3963 const mj_scalar_t &max_coordinate,
3964 const mj_scalar_t &min_coordinate,
3965 const mj_scalar_t &global_total_weight,
3966 const double &used_imbalance_tolerance,
3967 mj_scalar_t * current_global_part_weights,
3968 const mj_scalar_t * current_local_part_weights,
3969 const mj_scalar_t *current_part_target_weights,
3970 bool *current_cut_line_determined,
3971 mj_scalar_t *current_cut_coordinates,
3972 mj_scalar_t *current_cut_upper_bounds,
3973 mj_scalar_t *current_cut_lower_bounds,
3974 mj_scalar_t *current_global_left_closest_points,
3975 mj_scalar_t *current_global_right_closest_points,
3976 mj_scalar_t * current_cut_lower_bound_weights,
3977 mj_scalar_t * current_cut_upper_weights,
3978 mj_scalar_t *new_current_cut_coordinates,
3979 mj_scalar_t *current_part_cut_line_weight_to_put_left,
3980 mj_part_t *rectilinear_cut_count,
3981 mj_part_t &my_num_incomplete_cut){
3984 mj_scalar_t seen_weight_in_part = 0;
3986 mj_scalar_t expected_weight_in_part = 0;
3988 double imbalance_on_left = 0, imbalance_on_right = 0;
3991 #ifdef HAVE_ZOLTAN2_OMP
3994 for (mj_part_t i = 0; i < num_cuts; i++){
3997 if(min_coordinate - current_global_left_closest_points[i] > this->sEpsilon)
3998 current_global_left_closest_points[i] = current_cut_coordinates[i];
3999 if(current_global_right_closest_points[i] - max_coordinate > this->sEpsilon)
4000 current_global_right_closest_points[i] = current_cut_coordinates[i];
4003 #ifdef HAVE_ZOLTAN2_OMP
4006 for (mj_part_t i = 0; i < num_cuts; i++){
4008 if(this->distribute_points_on_cut_lines){
4010 this->global_rectilinear_cut_weight[i] = 0;
4011 this->process_rectilinear_cut_weight[i] = 0;
4015 if(current_cut_line_determined[i]) {
4016 new_current_cut_coordinates[i] = current_cut_coordinates[i];
4021 seen_weight_in_part = current_global_part_weights[i * 2];
4030 expected_weight_in_part = current_part_target_weights[i];
4032 imbalance_on_left =
imbalanceOf2(seen_weight_in_part, expected_weight_in_part);
4034 imbalance_on_right =
imbalanceOf2(global_total_weight - seen_weight_in_part, global_total_weight - expected_weight_in_part);
4036 bool is_left_imbalance_valid =
ZOLTAN2_ABS(imbalance_on_left) - used_imbalance_tolerance < this->sEpsilon ;
4037 bool is_right_imbalance_valid =
ZOLTAN2_ABS(imbalance_on_right) - used_imbalance_tolerance < this->sEpsilon;
4040 if(is_left_imbalance_valid && is_right_imbalance_valid){
4041 current_cut_line_determined[i] =
true;
4042 #ifdef HAVE_ZOLTAN2_OMP
4045 my_num_incomplete_cut -= 1;
4046 new_current_cut_coordinates [i] = current_cut_coordinates[i];
4049 else if(imbalance_on_left < 0){
4052 if(this->distribute_points_on_cut_lines){
4057 if (current_global_part_weights[i * 2 + 1] == expected_weight_in_part){
4059 current_cut_line_determined[i] =
true;
4060 #ifdef HAVE_ZOLTAN2_OMP
4063 my_num_incomplete_cut -= 1;
4066 new_current_cut_coordinates [i] = current_cut_coordinates[i];
4070 current_part_cut_line_weight_to_put_left[i] = current_local_part_weights[i * 2 + 1] - current_local_part_weights[i * 2];
4073 else if (current_global_part_weights[i * 2 + 1] > expected_weight_in_part){
4077 current_cut_line_determined[i] =
true;
4078 #ifdef HAVE_ZOLTAN2_OMP
4081 *rectilinear_cut_count += 1;
4084 #ifdef HAVE_ZOLTAN2_OMP
4087 my_num_incomplete_cut -= 1;
4088 new_current_cut_coordinates [i] = current_cut_coordinates[i];
4089 this->process_rectilinear_cut_weight[i] = current_local_part_weights[i * 2 + 1] -
4090 current_local_part_weights[i * 2];
4095 current_cut_lower_bounds[i] = current_global_right_closest_points[i];
4097 current_cut_lower_bound_weights[i] = seen_weight_in_part;
4101 for (mj_part_t ii = i + 1; ii < num_cuts ; ++ii){
4102 mj_scalar_t p_weight = current_global_part_weights[ii * 2];
4103 mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];
4105 if(p_weight >= expected_weight_in_part){
4110 if(p_weight == expected_weight_in_part){
4111 current_cut_upper_bounds[i] = current_cut_coordinates[ii];
4112 current_cut_upper_weights[i] = p_weight;
4113 current_cut_lower_bounds[i] = current_cut_coordinates[ii];
4114 current_cut_lower_bound_weights[i] = p_weight;
4115 }
else if (p_weight < current_cut_upper_weights[i]){
4118 current_cut_upper_bounds[i] = current_global_left_closest_points[ii];
4119 current_cut_upper_weights[i] = p_weight;
4125 if(line_weight >= expected_weight_in_part){
4128 current_cut_upper_bounds[i] = current_cut_coordinates[ii];
4129 current_cut_upper_weights[i] = line_weight;
4130 current_cut_lower_bounds[i] = current_cut_coordinates[ii];
4131 current_cut_lower_bound_weights[i] = p_weight;
4136 if (p_weight <= expected_weight_in_part && p_weight >= current_cut_lower_bound_weights[i]){
4137 current_cut_lower_bounds[i] = current_global_right_closest_points[ii] ;
4138 current_cut_lower_bound_weights[i] = p_weight;
4143 mj_scalar_t new_cut_position = 0;
4144 this->mj_calculate_new_cut_position(
4145 current_cut_upper_bounds[i],
4146 current_cut_lower_bounds[i],
4147 current_cut_upper_weights[i],
4148 current_cut_lower_bound_weights[i],
4149 expected_weight_in_part, new_cut_position);
4153 if (
ZOLTAN2_ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
4156 current_cut_line_determined[i] =
true;
4157 #ifdef HAVE_ZOLTAN2_OMP
4160 my_num_incomplete_cut -= 1;
4163 new_current_cut_coordinates [i] = current_cut_coordinates[i];
4165 new_current_cut_coordinates [i] = new_cut_position;
4171 current_cut_upper_bounds[i] = current_global_left_closest_points[i];
4172 current_cut_upper_weights[i] = seen_weight_in_part;
4175 for (
int ii = i - 1; ii >= 0; --ii){
4176 mj_scalar_t p_weight = current_global_part_weights[ii * 2];
4177 mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];
4178 if(p_weight <= expected_weight_in_part){
4179 if(p_weight == expected_weight_in_part){
4182 current_cut_upper_bounds[i] = current_cut_coordinates[ii];
4183 current_cut_upper_weights[i] = p_weight;
4184 current_cut_lower_bounds[i] = current_cut_coordinates[ii];
4185 current_cut_lower_bound_weights[i] = p_weight;
4187 else if (p_weight > current_cut_lower_bound_weights[i]){
4190 current_cut_lower_bounds[i] = current_global_right_closest_points[ii];
4191 current_cut_lower_bound_weights[i] = p_weight;
4197 if(line_weight > expected_weight_in_part){
4198 current_cut_upper_bounds[i] = current_global_right_closest_points[ii];
4199 current_cut_upper_weights[i] = line_weight;
4208 if (p_weight >= expected_weight_in_part &&
4209 (p_weight < current_cut_upper_weights[i] ||
4210 (p_weight == current_cut_upper_weights[i] &&
4211 current_cut_upper_bounds[i] > current_global_left_closest_points[ii]
4215 current_cut_upper_bounds[i] = current_global_left_closest_points[ii] ;
4216 current_cut_upper_weights[i] = p_weight;
4219 mj_scalar_t new_cut_position = 0;
4220 this->mj_calculate_new_cut_position(
4221 current_cut_upper_bounds[i],
4222 current_cut_lower_bounds[i],
4223 current_cut_upper_weights[i],
4224 current_cut_lower_bound_weights[i],
4225 expected_weight_in_part,
4229 if (
ZOLTAN2_ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
4231 current_cut_line_determined[i] =
true;
4232 #ifdef HAVE_ZOLTAN2_OMP
4235 my_num_incomplete_cut -= 1;
4237 new_current_cut_coordinates [ i] = current_cut_coordinates[i];
4239 new_current_cut_coordinates [ i] = new_cut_position;
4248 #ifdef HAVE_ZOLTAN2_OMP
4253 if(*rectilinear_cut_count > 0){
4256 Teuchos::scan<int,mj_scalar_t>(
4257 *comm, Teuchos::REDUCE_SUM,
4259 this->process_rectilinear_cut_weight,
4260 this->global_rectilinear_cut_weight
4265 for (mj_part_t i = 0; i < num_cuts; ++i){
4267 if(this->global_rectilinear_cut_weight[i] > 0) {
4269 mj_scalar_t expected_part_weight = current_part_target_weights[i];
4271 mj_scalar_t necessary_weight_on_line_for_left = expected_part_weight - current_global_part_weights[i * 2];
4273 mj_scalar_t my_weight_on_line = this->process_rectilinear_cut_weight[i];
4275 mj_scalar_t weight_on_line_upto_process_inclusive = this->global_rectilinear_cut_weight[i];
4278 mj_scalar_t space_to_put_left = necessary_weight_on_line_for_left - weight_on_line_upto_process_inclusive;
4280 mj_scalar_t space_left_to_me = space_to_put_left + my_weight_on_line;
4290 if(space_left_to_me < 0){
4292 current_part_cut_line_weight_to_put_left[i] = 0;
4294 else if(space_left_to_me >= my_weight_on_line){
4297 current_part_cut_line_weight_to_put_left[i] = my_weight_on_line;
4302 current_part_cut_line_weight_to_put_left[i] = space_left_to_me ;
4309 *rectilinear_cut_count = 0;
4324 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4326 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::get_processor_num_points_in_parts(
4327 mj_part_t num_procs,
4328 mj_part_t num_parts,
4329 mj_gno_t *&num_points_in_all_processor_parts){
4332 size_t allocation_size = num_parts * (num_procs + 1);
4339 mj_gno_t *num_local_points_in_each_part_to_reduce_sum = allocMemory<mj_gno_t>(allocation_size);
4344 mj_gno_t *my_local_points_to_reduce_sum = num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
4347 mj_gno_t *my_local_point_counts_in_each_art = num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
4350 memset(num_local_points_in_each_part_to_reduce_sum, 0,
sizeof(mj_gno_t)*allocation_size);
4353 for (mj_part_t i = 0; i < num_parts; ++i){
4354 mj_lno_t part_begin_index = 0;
4356 part_begin_index = this->new_part_xadj[i - 1];
4358 mj_lno_t part_end_index = this->new_part_xadj[i];
4359 my_local_points_to_reduce_sum[i] = part_end_index - part_begin_index;
4364 memcpy (my_local_point_counts_in_each_art,
4365 my_local_points_to_reduce_sum,
4366 sizeof(mj_gno_t) * (num_parts) );
4374 reduceAll<int, mj_gno_t>(
4376 Teuchos::REDUCE_SUM,
4378 num_local_points_in_each_part_to_reduce_sum,
4379 num_points_in_all_processor_parts);
4382 freeArray<mj_gno_t>(num_local_points_in_each_part_to_reduce_sum);
4399 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4401 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_check_to_migrate(
4402 size_t migration_reduce_all_population,
4403 mj_lno_t num_coords_for_last_dim_part,
4404 mj_part_t num_procs,
4405 mj_part_t num_parts,
4406 mj_gno_t *num_points_in_all_processor_parts){
4414 if (this->check_migrate_avoid_migration_option == 0){
4415 double global_imbalance = 0;
4417 size_t global_shift = num_procs * num_parts;
4419 for (mj_part_t ii = 0; ii < num_procs; ++ii){
4420 for (mj_part_t i = 0; i < num_parts; ++i){
4421 double ideal_num = num_points_in_all_processor_parts[global_shift + i]
4422 / double(num_procs);
4425 num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num);
4428 global_imbalance /= num_parts;
4429 global_imbalance /= num_procs;
4437 if(global_imbalance <= this->minimum_migration_imbalance){
4460 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4462 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::assign_send_destinations(
4463 mj_part_t num_parts,
4464 mj_part_t *part_assignment_proc_begin_indices,
4465 mj_part_t *processor_chains_in_parts,
4466 mj_lno_t *send_count_to_each_proc,
4467 int *coordinate_destinations){
4469 for (mj_part_t p = 0; p < num_parts; ++p){
4470 mj_lno_t part_begin = 0;
4471 if (p > 0) part_begin = this->new_part_xadj[p - 1];
4472 mj_lno_t part_end = this->new_part_xadj[p];
4475 mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
4477 mj_lno_t num_total_send = 0;
4478 for (mj_lno_t j=part_begin; j < part_end; j++){
4479 mj_lno_t local_ind = this->new_coordinate_permutations[j];
4480 while (num_total_send >= send_count_to_each_proc[proc_to_sent]){
4484 part_assignment_proc_begin_indices[p] = processor_chains_in_parts[proc_to_sent];
4486 processor_chains_in_parts[proc_to_sent] = -1;
4488 proc_to_sent = part_assignment_proc_begin_indices[p];
4491 coordinate_destinations[local_ind] = proc_to_sent;
4511 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4513 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_assign_proc_to_parts(
4514 mj_gno_t * num_points_in_all_processor_parts,
4515 mj_part_t num_parts,
4516 mj_part_t num_procs,
4517 mj_lno_t *send_count_to_each_proc,
4518 std::vector<mj_part_t> &processor_ranks_for_subcomm,
4519 std::vector<mj_part_t> *next_future_num_parts_in_parts,
4520 mj_part_t &out_part_index,
4521 mj_part_t &output_part_numbering_begin_index,
4522 int *coordinate_destinations){
4525 mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
4526 mj_part_t *num_procs_assigned_to_each_part = allocMemory<mj_part_t>(num_parts);
4529 bool did_i_find_my_group =
false;
4531 mj_part_t num_free_procs = num_procs;
4532 mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;
4534 double max_imbalance_difference = 0;
4535 mj_part_t max_differing_part = 0;
4538 for (mj_part_t i=0; i < num_parts; i++){
4541 double scalar_required_proc = num_procs *
4542 (double (global_num_points_in_parts[i]) / double (this->num_global_coords));
4545 mj_part_t required_proc =
static_cast<mj_part_t
> (0.5 + scalar_required_proc);
4546 if (required_proc == 0) required_proc = 1;
4550 if (num_free_procs - required_proc < minimum_num_procs_required_for_rest_of_parts){
4551 required_proc = num_free_procs - (minimum_num_procs_required_for_rest_of_parts);
4555 num_free_procs -= required_proc;
4557 --minimum_num_procs_required_for_rest_of_parts;
4560 num_procs_assigned_to_each_part[i] = required_proc;
4565 double imbalance_wrt_ideal = (scalar_required_proc - required_proc) / required_proc;
4566 if (imbalance_wrt_ideal > max_imbalance_difference){
4567 max_imbalance_difference = imbalance_wrt_ideal;
4568 max_differing_part = i;
4573 if (num_free_procs > 0){
4574 num_procs_assigned_to_each_part[max_differing_part] += num_free_procs;
4581 mj_part_t *part_assignment_proc_begin_indices = allocMemory<mj_part_t>(num_parts);
4583 mj_part_t *processor_chains_in_parts = allocMemory<mj_part_t>(num_procs);
4584 mj_part_t *processor_part_assignments = allocMemory<mj_part_t>(num_procs);
4593 for (
int i = 0; i < num_procs; ++i ){
4594 processor_part_assignments[i] = -1;
4595 processor_chains_in_parts[i] = -1;
4597 for (
int i = 0; i < num_parts; ++i ){
4598 part_assignment_proc_begin_indices[i] = -1;
4604 uSignedSortItem<mj_part_t, mj_gno_t, char> * sort_item_num_part_points_in_procs = allocMemory <uSignedSortItem<mj_part_t, mj_gno_t, char> > (num_procs);
4605 for(mj_part_t i = 0; i < num_parts; ++i){
4609 for(mj_part_t ii = 0; ii < num_procs; ++ii){
4610 sort_item_num_part_points_in_procs[ii].id = ii;
4613 if (processor_part_assignments[ii] == -1){
4614 sort_item_num_part_points_in_procs[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4615 sort_item_num_part_points_in_procs[ii].signbit = 1;
4625 sort_item_num_part_points_in_procs[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4626 sort_item_num_part_points_in_procs[ii].signbit = 0;
4630 uqSignsort<mj_part_t, mj_gno_t,char>(num_procs, sort_item_num_part_points_in_procs);
4640 mj_part_t required_proc_count = num_procs_assigned_to_each_part[i];
4641 mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
4642 mj_gno_t ideal_num_points_in_a_proc =
4643 Teuchos::as<mj_gno_t>(ceil (total_num_points_in_part /
double (required_proc_count)));
4646 mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
4647 mj_part_t next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
4648 mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
4652 for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
4653 mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].id;
4655 processor_part_assignments[proc_id] = i;
4658 bool did_change_sign =
false;
4660 for(mj_part_t ii = 0; ii < num_procs; ++ii){
4663 if (sort_item_num_part_points_in_procs[ii].signbit == 0){
4664 did_change_sign =
true;
4665 sort_item_num_part_points_in_procs[ii].signbit = 1;
4671 if(did_change_sign){
4673 uqSignsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count, sort_item_num_part_points_in_procs);
4685 if (!did_i_find_my_group){
4686 for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
4688 mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].id;
4690 processor_ranks_for_subcomm.push_back(proc_id_to_assign);
4692 if(proc_id_to_assign == this->myRank){
4694 did_i_find_my_group =
true;
4696 part_assignment_proc_begin_indices[i] = this->myRank;
4697 processor_chains_in_parts[this->myRank] = -1;
4700 send_count_to_each_proc[this->myRank] = sort_item_num_part_points_in_procs[ii].val;
4703 for (mj_part_t in = 0; in < i; ++in){
4704 output_part_numbering_begin_index += (*next_future_num_parts_in_parts)[in];
4711 if (!did_i_find_my_group){
4712 processor_ranks_for_subcomm.clear();
4720 for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii){
4721 mj_part_t nonassigned_proc_id = sort_item_num_part_points_in_procs[ii].id;
4722 mj_lno_t num_points_to_sent = sort_item_num_part_points_in_procs[ii].val;
4727 if (num_points_to_sent < 0) {
4728 std::cout <<
"Migration - processor assignments - for part:" << i <<
"from proc:" << nonassigned_proc_id <<
" num_points_to_sent:" << num_points_to_sent << std::endl;
4733 switch (migration_type){
4737 while (num_points_to_sent > 0){
4739 if (num_points_to_sent <= space_left_in_sent_proc){
4741 space_left_in_sent_proc -= num_points_to_sent;
4743 if (this->myRank == nonassigned_proc_id){
4745 send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
4748 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4749 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4750 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4752 num_points_to_sent = 0;
4756 if(space_left_in_sent_proc > 0){
4757 num_points_to_sent -= space_left_in_sent_proc;
4760 if (this->myRank == nonassigned_proc_id){
4762 send_count_to_each_proc[next_proc_to_send_id] = space_left_in_sent_proc;
4763 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4764 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4765 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4770 ++next_proc_to_send_index;
4773 if(next_part_to_send_index < nprocs - required_proc_count ){
4774 std::cout <<
"Migration - processor assignments - for part:"
4776 <<
" next_part_to_send :" << next_part_to_send_index
4777 <<
" nprocs:" << nprocs
4778 <<
" required_proc_count:" << required_proc_count
4779 <<
" Error: next_part_to_send_index < nprocs - required_proc_count" << std::endl;
4785 next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
4787 space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
4796 if (this->myRank == nonassigned_proc_id){
4798 send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
4801 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4802 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4803 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4805 num_points_to_sent = 0;
4806 ++next_proc_to_send_index;
4809 if (next_proc_to_send_index == num_procs){
4810 next_proc_to_send_index = num_procs - required_proc_count;
4813 next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
4815 space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
4828 this->assign_send_destinations(
4830 part_assignment_proc_begin_indices,
4831 processor_chains_in_parts,
4832 send_count_to_each_proc,
4833 coordinate_destinations);
4835 freeArray<mj_part_t>(part_assignment_proc_begin_indices);
4836 freeArray<mj_part_t>(processor_chains_in_parts);
4837 freeArray<mj_part_t>(processor_part_assignments);
4838 freeArray<uSignedSortItem<mj_part_t, mj_gno_t, char> > (sort_item_num_part_points_in_procs);
4839 freeArray<mj_part_t > (num_procs_assigned_to_each_part);
4856 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4858 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::assign_send_destinations2(
4859 mj_part_t num_parts,
4860 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
4861 int *coordinate_destinations,
4862 mj_part_t &output_part_numbering_begin_index,
4863 std::vector<mj_part_t> *next_future_num_parts_in_parts){
4865 mj_part_t part_shift_amount = output_part_numbering_begin_index;
4866 mj_part_t previous_processor = -1;
4867 for(mj_part_t i = 0; i < num_parts; ++i){
4868 mj_part_t p = sort_item_part_to_proc_assignment[i].id;
4870 mj_lno_t part_begin_index = 0;
4871 if (p > 0) part_begin_index = this->new_part_xadj[p - 1];
4872 mj_lno_t part_end_index = this->new_part_xadj[p];
4874 mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].val;
4875 if (this->myRank == assigned_proc && previous_processor != assigned_proc){
4876 output_part_numbering_begin_index = part_shift_amount;
4878 previous_processor = assigned_proc;
4879 part_shift_amount += (*next_future_num_parts_in_parts)[p];
4881 for (mj_lno_t j=part_begin_index; j < part_end_index; j++){
4882 mj_lno_t localInd = this->new_coordinate_permutations[j];
4883 coordinate_destinations[localInd] = assigned_proc;
4905 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4907 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_assign_parts_to_procs(
4908 mj_gno_t * num_points_in_all_processor_parts,
4909 mj_part_t num_parts,
4910 mj_part_t num_procs,
4911 mj_lno_t *send_count_to_each_proc,
4912 std::vector<mj_part_t> *next_future_num_parts_in_parts,
4913 mj_part_t &out_num_part,
4914 std::vector<mj_part_t> &out_part_indices,
4915 mj_part_t &output_part_numbering_begin_index,
4916 int *coordinate_destinations){
4919 mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
4920 out_part_indices.clear();
4924 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment = allocMemory <uSortItem<mj_part_t, mj_part_t> >(num_parts);
4925 uSortItem<mj_part_t, mj_gno_t> * sort_item_num_points_of_proc_in_part_i = allocMemory <uSortItem<mj_part_t, mj_gno_t> >(num_procs);
4929 mj_lno_t work_each = mj_lno_t (this->num_global_coords / (
double (num_procs)) + 0.5f);
4931 mj_lno_t *space_in_each_processor = allocMemory <mj_lno_t>(num_procs);
4933 for (mj_part_t i = 0; i < num_procs; ++i){
4934 space_in_each_processor[i] = work_each;
4941 mj_part_t *num_parts_proc_assigned = allocMemory <mj_part_t>(num_procs);
4942 memset(num_parts_proc_assigned, 0,
sizeof(mj_part_t) * num_procs);
4943 int empty_proc_count = num_procs;
4947 uSortItem<mj_part_t, mj_gno_t> * sort_item_point_counts_in_parts = allocMemory <uSortItem<mj_part_t, mj_gno_t> >(num_parts);
4951 for (mj_part_t i = 0; i < num_parts; ++i){
4952 sort_item_point_counts_in_parts[i].id = i;
4953 sort_item_point_counts_in_parts[i].val = global_num_points_in_parts[i];
4956 uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);
4962 for (mj_part_t j = 0; j < num_parts; ++j){
4964 mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].id;
4966 mj_gno_t load = global_num_points_in_parts[i];
4969 mj_part_t assigned_proc = -1;
4971 mj_part_t best_proc_to_assign = 0;
4975 for (mj_part_t ii = 0; ii < num_procs; ++ii){
4976 sort_item_num_points_of_proc_in_part_i[ii].id = ii;
4981 if (empty_proc_count < num_parts - j || num_parts_proc_assigned[ii] == 0){
4983 sort_item_num_points_of_proc_in_part_i[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4986 sort_item_num_points_of_proc_in_part_i[ii].val = -1;
4989 uqsort<mj_part_t, mj_gno_t>(num_procs, sort_item_num_points_of_proc_in_part_i);
4992 for (mj_part_t iii = num_procs - 1; iii >= 0; --iii){
4993 mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
4994 mj_lno_t left_space = space_in_each_processor[ii] - load;
4996 if(left_space >= 0 ){
5001 if (space_in_each_processor[best_proc_to_assign] < space_in_each_processor[ii]){
5002 best_proc_to_assign = ii;
5007 if (assigned_proc == -1){
5008 assigned_proc = best_proc_to_assign;
5011 if (num_parts_proc_assigned[assigned_proc]++ == 0){
5014 space_in_each_processor[assigned_proc] -= load;
5016 sort_item_part_to_proc_assignment[j].id = i;
5017 sort_item_part_to_proc_assignment[j].val = assigned_proc;
5021 if (assigned_proc == this->myRank){
5023 out_part_indices.push_back(i);
5027 send_count_to_each_proc[assigned_proc] += num_points_in_all_processor_parts[this->myRank * num_parts + i];
5029 freeArray<mj_part_t>(num_parts_proc_assigned);
5030 freeArray< uSortItem<mj_part_t, mj_gno_t> > (sort_item_num_points_of_proc_in_part_i);
5031 freeArray<uSortItem<mj_part_t, mj_gno_t> >(sort_item_point_counts_in_parts);
5032 freeArray<mj_lno_t >(space_in_each_processor);
5036 uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);
5040 this->assign_send_destinations2(
5042 sort_item_part_to_proc_assignment,
5043 coordinate_destinations,
5044 output_part_numbering_begin_index,
5045 next_future_num_parts_in_parts);
5047 freeArray<uSortItem<mj_part_t, mj_part_t> >(sort_item_part_to_proc_assignment);
5068 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5070 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_migration_part_proc_assignment(
5071 mj_gno_t * num_points_in_all_processor_parts,
5072 mj_part_t num_parts,
5073 mj_part_t num_procs,
5074 mj_lno_t *send_count_to_each_proc,
5075 std::vector<mj_part_t> &processor_ranks_for_subcomm,
5076 std::vector<mj_part_t> *next_future_num_parts_in_parts,
5077 mj_part_t &out_num_part,
5078 std::vector<mj_part_t> &out_part_indices,
5079 mj_part_t &output_part_numbering_begin_index,
5080 int *coordinate_destinations){
5084 processor_ranks_for_subcomm.clear();
5086 if (num_procs > num_parts){
5091 mj_part_t out_part_index = 0;
5092 this->mj_assign_proc_to_parts(
5093 num_points_in_all_processor_parts,
5096 send_count_to_each_proc,
5097 processor_ranks_for_subcomm,
5098 next_future_num_parts_in_parts,
5100 output_part_numbering_begin_index,
5101 coordinate_destinations
5105 out_part_indices.clear();
5106 out_part_indices.push_back(out_part_index);
5113 processor_ranks_for_subcomm.push_back(this->myRank);
5117 this->mj_assign_parts_to_procs(
5118 num_points_in_all_processor_parts,
5121 send_count_to_each_proc,
5122 next_future_num_parts_in_parts,
5125 output_part_numbering_begin_index,
5126 coordinate_destinations);
5142 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5144 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_migrate_coords(
5145 mj_part_t num_procs,
5146 mj_lno_t &num_new_local_points,
5147 std::string iteration,
5148 int *coordinate_destinations,
5149 mj_part_t num_parts)
5151 #ifdef ENABLE_ZOLTAN_MIGRATION
5152 if (
sizeof(mj_lno_t) <=
sizeof(
int)) {
5158 ZOLTAN_COMM_OBJ *plan = NULL;
5159 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->comm));
5160 int num_incoming_gnos = 0;
5161 int message_tag = 7859;
5163 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Migration Z1PlanCreating-" + iteration);
5164 int ierr = Zoltan_Comm_Create(
5166 int(this->num_local_coords),
5167 coordinate_destinations,
5170 &num_incoming_gnos);
5172 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Migration Z1PlanCreating-" + iteration);
5174 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Migration Z1Migration-" + iteration);
5175 mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(num_incoming_gnos);
5179 ierr = Zoltan_Comm_Do(
5182 (
char *) this->current_mj_gnos,
5184 (
char *) incoming_gnos);
5187 freeArray<mj_gno_t>(this->current_mj_gnos);
5188 this->current_mj_gnos = incoming_gnos;
5192 for (
int i = 0; i < this->coord_dim; ++i){
5194 mj_scalar_t *coord = this->mj_coordinates[i];
5196 this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
5197 ierr = Zoltan_Comm_Do(
5201 sizeof(mj_scalar_t),
5202 (
char *) this->mj_coordinates[i]);
5204 freeArray<mj_scalar_t>(coord);
5208 for (
int i = 0; i < this->num_weights_per_coord; ++i){
5210 mj_scalar_t *weight = this->mj_weights[i];
5212 this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
5213 ierr = Zoltan_Comm_Do(
5217 sizeof(mj_scalar_t),
5218 (
char *) this->mj_weights[i]);
5220 freeArray<mj_scalar_t>(weight);
5225 int *coord_own = allocMemory<int>(num_incoming_gnos);
5227 ierr = Zoltan_Comm_Do(
5230 (
char *) this->owner_of_coordinate,
5231 sizeof(
int), (
char *) coord_own);
5233 freeArray<int>(this->owner_of_coordinate);
5234 this->owner_of_coordinate = coord_own;
5240 mj_part_t *new_parts = allocMemory<mj_part_t>(num_incoming_gnos);
5241 if(num_procs < num_parts){
5243 ierr = Zoltan_Comm_Do(
5246 (
char *) this->assigned_part_ids,
5248 (
char *) new_parts);
5251 freeArray<mj_part_t>(this->assigned_part_ids);
5252 this->assigned_part_ids = new_parts;
5254 ierr = Zoltan_Comm_Destroy(&plan);
5256 num_new_local_points = num_incoming_gnos;
5257 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Migration Z1Migration-" + iteration);
5262 #endif // ENABLE_ZOLTAN_MIGRATION
5264 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Migration DistributorPlanCreating-" + iteration);
5265 Tpetra::Distributor distributor(this->comm);
5266 ArrayView<const mj_part_t> destinations( coordinate_destinations, this->num_local_coords);
5267 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
5268 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Migration DistributorPlanCreating-" + iteration);
5270 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Migration DistributorMigration-" + iteration);
5273 ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
5274 ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
5275 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
5276 freeArray<mj_gno_t>(this->current_mj_gnos);
5277 this->current_mj_gnos = allocMemory<mj_gno_t>(num_incoming_gnos);
5279 this->current_mj_gnos,
5280 received_gnos.getRawPtr(),
5281 num_incoming_gnos *
sizeof(mj_gno_t));
5284 for (
int i = 0; i < this->coord_dim; ++i){
5286 ArrayView<mj_scalar_t> sent_coord(this->mj_coordinates[i], this->num_local_coords);
5287 ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
5288 distributor.doPostsAndWaits<mj_scalar_t>(sent_coord, 1, received_coord());
5289 freeArray<mj_scalar_t>(this->mj_coordinates[i]);
5290 this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
5292 this->mj_coordinates[i],
5293 received_coord.getRawPtr(),
5294 num_incoming_gnos *
sizeof(mj_scalar_t));
5298 for (
int i = 0; i < this->num_weights_per_coord; ++i){
5300 ArrayView<mj_scalar_t> sent_weight(this->mj_weights[i], this->num_local_coords);
5301 ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
5302 distributor.doPostsAndWaits<mj_scalar_t>(sent_weight, 1, received_weight());
5303 freeArray<mj_scalar_t>(this->mj_weights[i]);
5304 this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
5306 this->mj_weights[i],
5307 received_weight.getRawPtr(),
5308 num_incoming_gnos *
sizeof(mj_scalar_t));
5313 ArrayView<int> sent_owners(this->owner_of_coordinate, this->num_local_coords);
5314 ArrayRCP<int> received_owners(num_incoming_gnos);
5315 distributor.doPostsAndWaits<
int>(sent_owners, 1, received_owners());
5316 freeArray<int>(this->owner_of_coordinate);
5317 this->owner_of_coordinate = allocMemory<int>(num_incoming_gnos);
5319 this->owner_of_coordinate,
5320 received_owners.getRawPtr(),
5321 num_incoming_gnos *
sizeof(int));
5327 if(num_procs < num_parts){
5328 ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
5329 ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
5330 distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
5331 freeArray<mj_part_t>(this->assigned_part_ids);
5332 this->assigned_part_ids = allocMemory<mj_part_t>(num_incoming_gnos);
5334 this->assigned_part_ids,
5335 received_partids.getRawPtr(),
5336 num_incoming_gnos *
sizeof(mj_part_t));
5339 mj_part_t *new_parts = allocMemory<int>(num_incoming_gnos);
5340 freeArray<mj_part_t>(this->assigned_part_ids);
5341 this->assigned_part_ids = new_parts;
5343 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Migration DistributorMigration-" + iteration);
5344 num_new_local_points = num_incoming_gnos;
5355 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5357 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm){
5358 mj_part_t group_size = processor_ranks_for_subcomm.size();
5359 mj_part_t *ids = allocMemory<mj_part_t>(group_size);
5360 for(mj_part_t i = 0; i < group_size; ++i) {
5361 ids[i] = processor_ranks_for_subcomm[i];
5363 ArrayView<const mj_part_t> idView(ids, group_size);
5364 this->comm = this->comm->createSubcommunicator(idView);
5365 freeArray<mj_part_t>(ids);
5374 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5376 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::fill_permutation_array(
5377 mj_part_t output_num_parts,
5378 mj_part_t num_parts){
5380 if (output_num_parts == 1){
5381 for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
5382 this->new_coordinate_permutations[i] = i;
5384 this->new_part_xadj[0] = this->num_local_coords;
5391 mj_lno_t *num_points_in_parts = allocMemory<mj_lno_t>(num_parts);
5393 mj_part_t *part_shifts = allocMemory<mj_part_t>(num_parts);
5395 memset(num_points_in_parts, 0,
sizeof(mj_lno_t) * num_parts);
5397 for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
5398 mj_part_t ii = this->assigned_part_ids[i];
5399 ++num_points_in_parts[ii];
5404 mj_lno_t prev_index = 0;
5405 for(mj_part_t i = 0; i < num_parts; ++i){
5406 if(num_points_in_parts[i] > 0) {
5407 this->new_part_xadj[p] = prev_index + num_points_in_parts[i];
5408 prev_index += num_points_in_parts[i];
5409 part_shifts[i] = p++;
5414 mj_part_t assigned_num_parts = p - 1;
5415 for (;p < num_parts; ++p){
5416 this->new_part_xadj[p] = this->new_part_xadj[assigned_num_parts];
5418 for(mj_part_t i = 0; i < output_num_parts; ++i){
5419 num_points_in_parts[i] = this->new_part_xadj[i];
5425 for(mj_lno_t i = this->num_local_coords - 1; i >= 0; --i){
5426 mj_part_t part = part_shifts[mj_part_t(this->assigned_part_ids[i])];
5427 this->new_coordinate_permutations[--num_points_in_parts[part]] = i;
5430 freeArray<mj_lno_t>(num_points_in_parts);
5431 freeArray<mj_part_t>(part_shifts);
5458 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5460 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_perform_migration(
5461 mj_part_t input_num_parts,
5462 mj_part_t &output_num_parts,
5463 std::vector<mj_part_t> *next_future_num_parts_in_parts,
5464 mj_part_t &output_part_begin_index,
5465 size_t migration_reduce_all_population,
5466 mj_lno_t num_coords_for_last_dim_part,
5467 std::string iteration,
5468 RCP<mj_partBoxVector_t> &input_part_boxes,
5469 RCP<mj_partBoxVector_t> &output_part_boxes
5472 mj_part_t num_procs = this->comm->getSize();
5473 this->myRank = this->comm->getRank();
5479 mj_gno_t *num_points_in_all_processor_parts = allocMemory<mj_gno_t>(input_num_parts * (num_procs + 1));
5482 this->get_processor_num_points_in_parts(
5485 num_points_in_all_processor_parts);
5489 if (!this->mj_check_to_migrate(
5490 migration_reduce_all_population,
5491 num_coords_for_last_dim_part,
5494 num_points_in_all_processor_parts)){
5495 freeArray<mj_gno_t>(num_points_in_all_processor_parts);
5500 mj_lno_t *send_count_to_each_proc = NULL;
5501 int *coordinate_destinations = allocMemory<int>(this->num_local_coords);
5502 send_count_to_each_proc = allocMemory<mj_lno_t>(num_procs);
5503 for (
int i = 0; i < num_procs; ++i) send_count_to_each_proc[i] = 0;
5505 std::vector<mj_part_t> processor_ranks_for_subcomm;
5506 std::vector<mj_part_t> out_part_indices;
5509 this->mj_migration_part_proc_assignment(
5510 num_points_in_all_processor_parts,
5513 send_count_to_each_proc,
5514 processor_ranks_for_subcomm,
5515 next_future_num_parts_in_parts,
5518 output_part_begin_index,
5519 coordinate_destinations);
5524 freeArray<mj_lno_t>(send_count_to_each_proc);
5525 std::vector <mj_part_t> tmpv;
5527 std::sort (out_part_indices.begin(), out_part_indices.end());
5528 mj_part_t outP = out_part_indices.size();
5530 mj_gno_t new_global_num_points = 0;
5531 mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * input_num_parts;
5533 if (this->mj_keep_part_boxes){
5534 input_part_boxes->clear();
5539 for (mj_part_t i = 0; i < outP; ++i){
5540 mj_part_t ind = out_part_indices[i];
5541 new_global_num_points += global_num_points_in_parts[ind];
5542 tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
5543 if (this->mj_keep_part_boxes){
5544 input_part_boxes->push_back((*output_part_boxes)[ind]);
5548 if (this->mj_keep_part_boxes){
5549 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
5550 input_part_boxes = output_part_boxes;
5551 output_part_boxes = tmpPartBoxes;
5553 next_future_num_parts_in_parts->clear();
5554 for (mj_part_t i = 0; i < outP; ++i){
5555 mj_part_t p = tmpv[i];
5556 next_future_num_parts_in_parts->push_back(p);
5559 freeArray<mj_gno_t>(num_points_in_all_processor_parts);
5561 mj_lno_t num_new_local_points = 0;
5565 this->mj_migrate_coords(
5567 num_new_local_points,
5569 coordinate_destinations,
5573 freeArray<int>(coordinate_destinations);
5575 if(this->num_local_coords != num_new_local_points){
5576 freeArray<mj_lno_t>(this->new_coordinate_permutations);
5577 freeArray<mj_lno_t>(this->coordinate_permutations);
5579 this->new_coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
5580 this->coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
5582 this->num_local_coords = num_new_local_points;
5583 this->num_global_coords = new_global_num_points;
5588 this->create_sub_communicator(processor_ranks_for_subcomm);
5589 processor_ranks_for_subcomm.clear();
5592 this->fill_permutation_array(
5612 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5614 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::create_consistent_chunks(
5615 mj_part_t num_parts,
5616 mj_scalar_t *mj_current_dim_coords,
5617 mj_scalar_t *current_concurrent_cut_coordinate,
5618 mj_lno_t coordinate_begin,
5619 mj_lno_t coordinate_end,
5620 mj_scalar_t *used_local_cut_line_weight_to_left,
5621 mj_lno_t *out_part_xadj,
5622 int coordInd,
bool longest_dim_part, uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted){
5625 mj_part_t no_cuts = num_parts - 1;
5630 mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
5631 mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;
5636 if (this->distribute_points_on_cut_lines){
5638 my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
5639 for (mj_part_t i = 0; i < no_cuts; ++i){
5641 mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
5643 for(
int ii = 0; ii < this->num_threads; ++ii){
5644 if(left_weight > this->sEpsilon){
5646 mj_scalar_t thread_ii_weight_on_cut = this->thread_part_weight_work[ii][i * 2 + 1] - this->thread_part_weight_work[ii][i * 2 ];
5647 if(thread_ii_weight_on_cut < left_weight){
5648 this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
5651 this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
5653 left_weight -= thread_ii_weight_on_cut;
5656 this->thread_cut_line_weight_to_put_left[ii][i] = 0;
5664 for (mj_part_t i = no_cuts - 1; i > 0 ; --i){
5665 if(
ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
5666 my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
5674 for(mj_part_t ii = 0; ii < num_parts; ++ii){
5675 thread_num_points_in_parts[ii] = 0;
5687 mj_part_t *cut_map = allocMemory<mj_part_t> (no_cuts);
5690 typedef uMultiSortItem<mj_lno_t, int, mj_scalar_t> multiSItem;
5691 typedef std::vector< multiSItem > multiSVector;
5692 typedef std::vector<multiSVector> multiS2Vector;
5695 std::vector<mj_scalar_t *>allocated_memory;
5698 multiS2Vector sort_vector_points_on_cut;
5701 mj_part_t different_cut_count = 1;
5706 multiSVector tmpMultiSVector;
5707 sort_vector_points_on_cut.push_back(tmpMultiSVector);
5709 for (mj_part_t i = 1; i < no_cuts ; ++i){
5712 if(
ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
5713 cut_map[i] = cut_map[i-1];
5716 cut_map[i] = different_cut_count++;
5717 multiSVector tmp2MultiSVector;
5718 sort_vector_points_on_cut.push_back(tmp2MultiSVector);
5724 for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
5726 mj_lno_t i = this->coordinate_permutations[ii];
5728 mj_part_t pp = this->assigned_part_ids[i];
5729 mj_part_t p = pp / 2;
5732 mj_scalar_t *
vals = allocMemory<mj_scalar_t>(this->coord_dim -1);
5733 allocated_memory.push_back(vals);
5738 if (longest_dim_part){
5740 for(
int dim = this->coord_dim - 2; dim >= 0; --dim){
5742 int next_largest_coord_dim = p_coord_dimension_range_sorted[dim].id;
5744 vals[val_ind++] = this->mj_coordinates[next_largest_coord_dim][i];
5748 for(
int dim = coordInd + 1; dim < this->coord_dim; ++dim){
5749 vals[val_ind++] = this->mj_coordinates[dim][i];
5751 for(
int dim = 0; dim < coordInd; ++dim){
5752 vals[val_ind++] = this->mj_coordinates[dim][i];
5755 multiSItem tempSortItem(i, this->coord_dim -1, vals);
5757 mj_part_t cmap = cut_map[p];
5758 sort_vector_points_on_cut[cmap].push_back(tempSortItem);
5762 ++thread_num_points_in_parts[p];
5763 this->assigned_part_ids[i] = p;
5768 for (mj_part_t i = 0; i < different_cut_count; ++i){
5769 std::sort (sort_vector_points_on_cut[i].begin(), sort_vector_points_on_cut[i].end());
5773 mj_part_t previous_cut_map = cut_map[0];
5782 mj_scalar_t weight_stolen_from_previous_part = 0;
5783 for (mj_part_t p = 0; p < no_cuts; ++p){
5785 mj_part_t mapped_cut = cut_map[p];
5789 if (previous_cut_map != mapped_cut){
5790 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
5791 for (; sort_vector_end >= 0; --sort_vector_end){
5792 multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
5793 mj_lno_t i = t.index;
5794 ++thread_num_points_in_parts[p];
5795 this->assigned_part_ids[i] = p;
5797 sort_vector_points_on_cut[previous_cut_map].clear();
5801 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[mapped_cut].size() - 1;
5806 for (; sort_vector_end >= 0; --sort_vector_end){
5809 multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
5811 mj_lno_t i = t.index;
5812 mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
5816 if( my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part> this->sEpsilon &&
5817 my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part -
ZOLTAN2_ABS(my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part - w)
5820 my_local_thread_cut_weights_to_put_left[p] -= w;
5821 sort_vector_points_on_cut[mapped_cut].pop_back();
5822 ++thread_num_points_in_parts[p];
5823 this->assigned_part_ids[i] = p;
5826 if(p < no_cuts - 1 && my_local_thread_cut_weights_to_put_left[p] < this->sEpsilon){
5827 if(mapped_cut == cut_map[p + 1] ){
5830 if (previous_cut_map != mapped_cut){
5831 weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
5836 weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
5840 weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
5849 if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]){
5850 if (previous_cut_map != mapped_cut){
5851 weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
5854 weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
5858 weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
5864 previous_cut_map = mapped_cut;
5869 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
5874 for (; sort_vector_end >= 0; --sort_vector_end){
5877 multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
5879 mj_lno_t i = t.index;
5880 ++thread_num_points_in_parts[no_cuts];
5881 this->assigned_part_ids[i] = no_cuts;
5883 sort_vector_points_on_cut[previous_cut_map].clear();
5884 freeArray<mj_part_t> (cut_map);
5887 mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
5888 for(mj_lno_t i = 0; i < vSize; ++i){
5889 freeArray<mj_scalar_t> (allocated_memory[i]);
5893 for(mj_part_t j = 0; j < num_parts; ++j){
5894 mj_lno_t num_points_in_part_j_upto_thread_i = 0;
5895 for (
int i = 0; i < this->num_threads; ++i){
5896 mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
5897 this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
5898 num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;
5901 out_part_xadj[j] = num_points_in_part_j_upto_thread_i;
5905 for(mj_part_t j = 1; j < num_parts; ++j){
5906 out_part_xadj[j] += out_part_xadj[j - 1];
5912 for(mj_part_t j = 1; j < num_parts; ++j){
5913 thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
5918 for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
5919 mj_lno_t i = this->coordinate_permutations[ii];
5920 mj_part_t p = this->assigned_part_ids[i];
5921 this->new_coordinate_permutations[coordinate_begin +
5922 thread_num_points_in_parts[p]++] = i;
5937 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5939 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_final_parts(
5940 mj_part_t current_num_parts,
5941 mj_part_t output_part_begin_index,
5942 RCP<mj_partBoxVector_t> &output_part_boxes,
5943 bool is_data_ever_migrated)
5945 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Part_Assignment");
5947 #ifdef HAVE_ZOLTAN2_OMP
5948 #pragma omp parallel for
5950 for(mj_part_t i = 0; i < current_num_parts;++i){
5953 mj_lno_t end = this->part_xadj[i];
5955 if(i > 0) begin = this->part_xadj[i -1];
5956 mj_part_t part_to_set_index = i + output_part_begin_index;
5957 if (this->mj_keep_part_boxes){
5958 (*output_part_boxes)[i].setpId(part_to_set_index);
5960 for (mj_lno_t ii = begin; ii < end; ++ii){
5961 mj_lno_t k = this->coordinate_permutations[ii];
5962 this->assigned_part_ids[k] = part_to_set_index;
5967 if(!is_data_ever_migrated){
5974 #ifdef ENABLE_ZOLTAN_MIGRATION
5975 if (
sizeof(mj_lno_t) <=
sizeof(
int)) {
5982 ZOLTAN_COMM_OBJ *plan = NULL;
5983 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->mj_problemComm));
5986 int message_tag = 7856;
5988 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Final Z1PlanCreating");
5989 int ierr = Zoltan_Comm_Create( &plan,
int(this->num_local_coords),
5990 this->owner_of_coordinate, mpi_comm, message_tag,
5993 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Final Z1PlanCreating" );
5995 mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(incoming);
5998 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Final Z1PlanComm");
5999 ierr = Zoltan_Comm_Do( plan, message_tag, (
char *) this->current_mj_gnos,
6000 sizeof(mj_gno_t), (
char *) incoming_gnos);
6003 freeArray<mj_gno_t>(this->current_mj_gnos);
6004 this->current_mj_gnos = incoming_gnos;
6006 mj_part_t *incoming_partIds = allocMemory< mj_part_t>(incoming);
6009 ierr = Zoltan_Comm_Do( plan, message_tag, (
char *) this->assigned_part_ids,
6010 sizeof(mj_part_t), (
char *) incoming_partIds);
6012 freeArray<mj_part_t>(this->assigned_part_ids);
6013 this->assigned_part_ids = incoming_partIds;
6015 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Final Z1PlanComm");
6016 ierr = Zoltan_Comm_Destroy(&plan);
6019 this->num_local_coords = incoming;
6024 #endif // !ENABLE_ZOLTAN_MIGRATION
6027 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Final DistributorPlanCreating");
6028 Tpetra::Distributor distributor(this->mj_problemComm);
6029 ArrayView<const mj_part_t> owners_of_coords(this->owner_of_coordinate, this->num_local_coords);
6030 mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
6031 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Final DistributorPlanCreating" );
6033 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Final DistributorPlanComm");
6035 ArrayRCP<mj_gno_t> received_gnos(incoming);
6036 ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
6037 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
6038 freeArray<mj_gno_t>(this->current_mj_gnos);
6039 this->current_mj_gnos = allocMemory<mj_gno_t>(incoming);
6040 memcpy( this->current_mj_gnos,
6041 received_gnos.getRawPtr(),
6042 incoming *
sizeof(mj_gno_t));
6045 ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
6046 ArrayRCP<mj_part_t> received_partids(incoming);
6047 distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
6048 freeArray<mj_part_t>(this->assigned_part_ids);
6049 this->assigned_part_ids = allocMemory<mj_part_t>(incoming);
6050 memcpy( this->assigned_part_ids,
6051 received_partids.getRawPtr(),
6052 incoming *
sizeof(mj_part_t));
6054 this->num_local_coords = incoming;
6055 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Final DistributorPlanComm");
6060 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Part_Assignment");
6062 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Solution_Part_Assignment");
6067 if (this->mj_keep_part_boxes){
6068 this->kept_boxes = compute_global_box_boundaries(output_part_boxes);
6072 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Solution_Part_Assignment");
6077 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6079 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::free_work_memory(){
6080 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Problem_Free");
6082 for (
int i=0; i < this->coord_dim; i++){
6083 freeArray<mj_scalar_t>(this->mj_coordinates[i]);
6085 freeArray<mj_scalar_t *>(this->mj_coordinates);
6087 for (
int i=0; i < this->num_weights_per_coord; i++){
6088 freeArray<mj_scalar_t>(this->mj_weights[i]);
6090 freeArray<mj_scalar_t *>(this->mj_weights);
6092 freeArray<int>(this->owner_of_coordinate);
6094 for(
int i = 0; i < this->num_threads; ++i){
6095 freeArray<mj_lno_t>(this->thread_point_counts[i]);
6098 freeArray<mj_lno_t *>(this->thread_point_counts);
6099 freeArray<double *> (this->thread_part_weight_work);
6101 if(this->distribute_points_on_cut_lines){
6102 freeArray<mj_scalar_t>(this->process_cut_line_weight_to_put_left);
6103 for(
int i = 0; i < this->num_threads; ++i){
6104 freeArray<mj_scalar_t>(this->thread_cut_line_weight_to_put_left[i]);
6106 freeArray<mj_scalar_t *>(this->thread_cut_line_weight_to_put_left);
6107 freeArray<mj_scalar_t>(this->process_rectilinear_cut_weight);
6108 freeArray<mj_scalar_t>(this->global_rectilinear_cut_weight);
6111 freeArray<mj_part_t>(this->my_incomplete_cut_count);
6113 freeArray<mj_scalar_t>(this->max_min_coords);
6115 freeArray<mj_lno_t>(this->part_xadj);
6117 freeArray<mj_lno_t>(this->coordinate_permutations);
6119 freeArray<mj_lno_t>(this->new_coordinate_permutations);
6121 freeArray<mj_scalar_t>(this->all_cut_coordinates);
6123 freeArray<mj_scalar_t> (this->process_local_min_max_coord_total_weight);
6125 freeArray<mj_scalar_t> (this->global_min_max_coord_total_weight);
6127 freeArray<mj_scalar_t>(this->cut_coordinates_work_array);
6129 freeArray<mj_scalar_t>(this->target_part_weights);
6131 freeArray<mj_scalar_t>(this->cut_upper_bound_coordinates);
6133 freeArray<mj_scalar_t>(this->cut_lower_bound_coordinates);
6135 freeArray<mj_scalar_t>(this->cut_lower_bound_weights);
6136 freeArray<mj_scalar_t>(this->cut_upper_bound_weights);
6137 freeArray<bool>(this->is_cut_line_determined);
6138 freeArray<mj_scalar_t>(this->total_part_weight_left_right_closests);
6139 freeArray<mj_scalar_t>(this->global_total_part_weight_left_right_closests);
6141 for(
int i = 0; i < this->num_threads; ++i){
6142 freeArray<double>(this->thread_part_weights[i]);
6143 freeArray<mj_scalar_t>(this->thread_cut_right_closest_point[i]);
6144 freeArray<mj_scalar_t>(this->thread_cut_left_closest_point[i]);
6147 freeArray<double *>(this->thread_part_weights);
6148 freeArray<mj_scalar_t *>(this->thread_cut_left_closest_point);
6149 freeArray<mj_scalar_t *>(this->thread_cut_right_closest_point);
6151 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Free");
6162 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6165 bool distribute_points_on_cut_lines_,
6166 int max_concurrent_part_calculation_,
6167 int check_migrate_avoid_migration_option_,
6168 double minimum_migration_imbalance_,
6169 int migration_type_ ){
6170 this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
6171 this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
6172 this->check_migrate_avoid_migration_option = check_migrate_avoid_migration_option_;
6173 this->minimum_migration_imbalance = minimum_migration_imbalance_;
6174 this->migration_type = migration_type_;
6208 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6212 const RCP<const Environment> &env,
6213 RCP<
const Comm<int> > &problemComm,
6215 double imbalance_tolerance_,
6216 size_t num_global_parts_,
6217 mj_part_t *part_no_array_,
6218 int recursion_depth_,
6221 mj_lno_t num_local_coords_,
6222 mj_gno_t num_global_coords_,
6223 const mj_gno_t *initial_mj_gnos_,
6224 mj_scalar_t **mj_coordinates_,
6226 int num_weights_per_coord_,
6227 bool *mj_uniform_weights_,
6228 mj_scalar_t **mj_weights_,
6229 bool *mj_uniform_parts_,
6230 mj_scalar_t **mj_part_sizes_,
6232 mj_part_t *&result_assigned_part_ids_,
6233 mj_gno_t *&result_mj_gnos_
6240 if(comm->getRank() == 0){
6241 std::cout <<
"size of gno:" <<
sizeof(mj_gno_t) << std::endl;
6242 std::cout <<
"size of lno:" <<
sizeof(mj_lno_t) << std::endl;
6243 std::cout <<
"size of mj_scalar_t:" <<
sizeof(mj_scalar_t) << std::endl;
6247 this->mj_problemComm = problemComm;
6248 this->myActualRank = this->myRank = this->mj_problemComm->getRank();
6270 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Total");
6271 this->mj_env->debug(3,
"In MultiJagged Jagged");
6274 this->imbalance_tolerance = imbalance_tolerance_;
6275 this->num_global_parts = num_global_parts_;
6276 this->part_no_array = part_no_array_;
6277 this->recursion_depth = recursion_depth_;
6279 this->coord_dim = coord_dim_;
6280 this->num_local_coords = num_local_coords_;
6281 this->num_global_coords = num_global_coords_;
6282 this->mj_coordinates = mj_coordinates_;
6283 this->initial_mj_gnos = (mj_gno_t *) initial_mj_gnos_;
6285 this->num_weights_per_coord = num_weights_per_coord_;
6286 this->mj_uniform_weights = mj_uniform_weights_;
6287 this->mj_weights = mj_weights_;
6288 this->mj_uniform_parts = mj_uniform_parts_;
6289 this->mj_part_sizes = mj_part_sizes_;
6291 this->num_threads = 1;
6292 #ifdef HAVE_ZOLTAN2_OMP
6293 #pragma omp parallel
6296 this->num_threads = omp_get_num_threads();
6301 this->set_part_specifications();
6303 this->allocate_set_work_memory();
6307 this->comm = this->mj_problemComm->duplicate();
6310 mj_part_t current_num_parts = 1;
6311 mj_scalar_t *current_cut_coordinates = this->all_cut_coordinates;
6313 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Problem_Partitioning");
6315 mj_part_t output_part_begin_index = 0;
6316 mj_part_t future_num_parts = this->total_num_part;
6317 bool is_data_ever_migrated =
false;
6319 std::vector<mj_part_t> *future_num_part_in_parts =
new std::vector<mj_part_t> ();
6320 std::vector<mj_part_t> *next_future_num_parts_in_parts =
new std::vector<mj_part_t> ();
6321 next_future_num_parts_in_parts->push_back(this->num_global_parts);
6323 RCP<mj_partBoxVector_t> input_part_boxes(
new mj_partBoxVector_t(),
true) ;
6324 RCP<mj_partBoxVector_t> output_part_boxes(
new mj_partBoxVector_t(),
true);
6326 compute_global_box();
6327 if(this->mj_keep_part_boxes){
6328 this->init_part_boxes(output_part_boxes);
6331 for (
int i = 0; i < this->recursion_depth; ++i){
6334 std::vector <mj_part_t> num_partitioning_in_current_dim;
6348 std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
6349 future_num_part_in_parts = next_future_num_parts_in_parts;
6350 next_future_num_parts_in_parts = tmpPartVect;
6355 next_future_num_parts_in_parts->clear();
6357 if(this->mj_keep_part_boxes){
6358 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
6359 input_part_boxes = output_part_boxes;
6360 output_part_boxes = tmpPartBoxes;
6361 output_part_boxes->clear();
6365 mj_part_t output_part_count_in_dimension =
6366 this->update_part_num_arrays(
6367 num_partitioning_in_current_dim,
6368 future_num_part_in_parts,
6369 next_future_num_parts_in_parts,
6374 output_part_boxes, 1);
6379 if(output_part_count_in_dimension == current_num_parts) {
6381 tmpPartVect= future_num_part_in_parts;
6382 future_num_part_in_parts = next_future_num_parts_in_parts;
6383 next_future_num_parts_in_parts = tmpPartVect;
6385 if(this->mj_keep_part_boxes){
6386 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
6387 input_part_boxes = output_part_boxes;
6388 output_part_boxes = tmpPartBoxes;
6395 int coordInd = i % this->coord_dim;
6396 mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
6399 std::string istring = Teuchos::toString<int>(i);
6400 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Problem_Partitioning_" + istring);
6404 this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);
6407 mj_part_t output_part_index = 0;
6410 mj_part_t output_coordinate_end_index = 0;
6412 mj_part_t current_work_part = 0;
6413 mj_part_t current_concurrent_num_parts =
6414 std::min(current_num_parts - current_work_part, this->max_concurrent_part_calculation);
6416 mj_part_t obtained_part_index = 0;
6419 for (; current_work_part < current_num_parts;
6420 current_work_part += current_concurrent_num_parts){
6422 current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
6423 this->max_concurrent_part_calculation);
6425 mj_part_t actual_work_part_count = 0;
6429 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
6430 mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;
6434 if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
6437 ++actual_work_part_count;
6438 mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
6439 mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts == 0 ?
6440 0 : this->part_xadj[current_work_part_in_concurrent_parts - 1];
6448 this->mj_get_local_min_max_coord_totW(
6449 coordinate_begin_index,
6450 coordinate_end_index,
6451 this->coordinate_permutations,
6452 mj_current_dim_coords,
6453 this->process_local_min_max_coord_total_weight[kk],
6454 this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts],
6455 this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts]);
6460 if (actual_work_part_count > 0){
6462 this->mj_get_global_min_max_coord_totW(
6463 current_concurrent_num_parts,
6464 this->process_local_min_max_coord_total_weight,
6465 this->global_min_max_coord_total_weight);
6469 mj_part_t total_incomplete_cut_count = 0;
6474 mj_part_t concurrent_part_cut_shift = 0;
6475 mj_part_t concurrent_part_part_shift = 0;
6476 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
6477 mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
6478 mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
6479 current_concurrent_num_parts];
6481 mj_scalar_t global_total_weight = this->global_min_max_coord_total_weight[kk +
6482 2 * current_concurrent_num_parts];
6484 mj_part_t concurrent_current_part_index = current_work_part + kk;
6486 mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];
6488 mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
6489 mj_scalar_t *current_target_part_weights = this->target_part_weights +
6490 concurrent_part_part_shift;
6492 concurrent_part_cut_shift += partition_count - 1;
6494 concurrent_part_part_shift += partition_count;
6499 if(partition_count > 1 && min_coordinate <= max_coordinate){
6503 total_incomplete_cut_count += partition_count - 1;
6506 this->my_incomplete_cut_count[kk] = partition_count - 1;
6509 this->mj_get_initial_cut_coords_target_weights(
6512 partition_count - 1,
6513 global_total_weight,
6515 current_target_part_weights,
6516 future_num_part_in_parts,
6517 next_future_num_parts_in_parts,
6518 concurrent_current_part_index,
6519 obtained_part_index);
6521 mj_lno_t coordinate_end_index= this->part_xadj[concurrent_current_part_index];
6522 mj_lno_t coordinate_begin_index = concurrent_current_part_index == 0 ?
6523 0 : this->part_xadj[concurrent_current_part_index - 1];
6527 this->set_initial_coordinate_parts(
6530 concurrent_current_part_index,
6531 coordinate_begin_index, coordinate_end_index,
6532 this->coordinate_permutations,
6533 mj_current_dim_coords,
6534 this->assigned_part_ids,
6539 this->my_incomplete_cut_count[kk] = 0;
6541 obtained_part_index += partition_count;
6548 double used_imbalance = 0;
6553 mj_current_dim_coords,
6556 current_concurrent_num_parts,
6557 current_cut_coordinates,
6558 total_incomplete_cut_count,
6559 num_partitioning_in_current_dim);
6564 mj_part_t output_array_shift = 0;
6565 mj_part_t cut_shift = 0;
6566 size_t tlr_shift = 0;
6567 size_t partweight_array_shift = 0;
6569 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
6570 mj_part_t current_concurrent_work_part = current_work_part + kk;
6571 mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];
6574 if((num_parts != 1 )
6576 this->global_min_max_coord_total_weight[kk] >
6577 this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {
6581 for(mj_part_t jj = 0; jj < num_parts; ++jj){
6582 this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
6584 cut_shift += num_parts - 1;
6585 tlr_shift += (4 *(num_parts - 1) + 1);
6586 output_array_shift += num_parts;
6587 partweight_array_shift += (2 * (num_parts - 1) + 1);
6591 mj_lno_t coordinate_end= this->part_xadj[current_concurrent_work_part];
6592 mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[
6593 current_concurrent_work_part -1];
6594 mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
6595 mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
6600 for(
int ii = 0; ii < this->num_threads; ++ii){
6601 this->thread_part_weight_work[ii] = this->thread_part_weights[ii] + partweight_array_shift;
6605 if(this->mj_keep_part_boxes){
6607 for (mj_part_t j = 0; j < num_parts - 1; ++j){
6608 (*output_part_boxes)[output_array_shift + output_part_index +
6609 j].updateMinMax(current_concurrent_cut_coordinate[j], 1
6612 (*output_part_boxes)[output_array_shift + output_part_index + j +
6613 1].updateMinMax(current_concurrent_cut_coordinate[j], 0
6619 this->mj_create_new_partitions(
6621 mj_current_dim_coords,
6622 current_concurrent_cut_coordinate,
6625 used_local_cut_line_weight_to_left,
6626 this->thread_part_weight_work,
6627 this->new_part_xadj + output_part_index + output_array_shift
6634 mj_lno_t part_size = coordinate_end - coordinate_begin;
6635 *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
6637 this->new_coordinate_permutations + coordinate_begin,
6638 this->coordinate_permutations + coordinate_begin,
6639 part_size *
sizeof(mj_lno_t));
6641 cut_shift += num_parts - 1;
6642 tlr_shift += (4 *(num_parts - 1) + 1);
6643 output_array_shift += num_parts;
6644 partweight_array_shift += (2 * (num_parts - 1) + 1);
6654 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
6655 mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
6656 for (mj_part_t ii = 0;ii < num_parts ; ++ii){
6658 this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
6661 output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
6663 output_part_index += num_parts ;
6670 int current_world_size = this->comm->getSize();
6671 long migration_reduce_all_population = this->total_dim_num_reduce_all * current_world_size;
6674 bool is_migrated_in_current_dimension =
false;
6679 if (future_num_parts > 1 &&
6680 this->check_migrate_avoid_migration_option >= 0 &&
6681 current_world_size > 1){
6683 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Problem_Migration-" + istring);
6684 mj_part_t num_parts = output_part_count_in_dimension;
6685 if ( this->mj_perform_migration(
6688 next_future_num_parts_in_parts,
6689 output_part_begin_index,
6690 migration_reduce_all_population,
6691 this->num_global_coords / (future_num_parts * current_num_parts),
6693 input_part_boxes, output_part_boxes) ) {
6694 is_migrated_in_current_dimension =
true;
6695 is_data_ever_migrated =
true;
6696 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Migration-" +
6699 this->total_dim_num_reduce_all /= num_parts;
6702 is_migrated_in_current_dimension =
false;
6703 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Migration-" + istring);
6708 mj_lno_t * tmp = this->coordinate_permutations;
6709 this->coordinate_permutations = this->new_coordinate_permutations;
6710 this->new_coordinate_permutations = tmp;
6712 if(!is_migrated_in_current_dimension){
6713 this->total_dim_num_reduce_all -= current_num_parts;
6714 current_num_parts = output_part_count_in_dimension;
6716 freeArray<mj_lno_t>(this->part_xadj);
6717 this->part_xadj = this->new_part_xadj;
6718 this->new_part_xadj = NULL;
6719 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Partitioning_" + istring);
6723 delete future_num_part_in_parts;
6724 delete next_future_num_parts_in_parts;
6726 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Partitioning");
6733 this->set_final_parts(
6735 output_part_begin_index,
6737 is_data_ever_migrated);
6739 result_assigned_part_ids_ = this->assigned_part_ids;
6740 result_mj_gnos_ = this->current_mj_gnos;
6742 this->free_work_memory();
6743 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Total");
6744 this->mj_env->debug(3,
"Out of MultiJagged");
6752 template <
typename Adapter>
6757 #ifndef DOXYGEN_SHOULD_SKIP_THIS
6764 typedef typename Adapter::scalar_t adapter_scalar_t;
6767 typedef float default_mj_scalar_t;
6773 (std::is_same<adapter_scalar_t, float>::value ||
6774 std::is_same<adapter_scalar_t, double>::value),
6775 adapter_scalar_t, default_mj_scalar_t>::type mj_scalar_t;
6777 typedef typename Adapter::gno_t mj_gno_t;
6778 typedef typename Adapter::lno_t mj_lno_t;
6779 typedef typename Adapter::node_t mj_node_t;
6782 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
6786 RCP<const Environment> mj_env;
6787 RCP<const Comm<int> > mj_problemComm;
6788 RCP<const coordinateModel_t> mj_coords;
6791 double imbalance_tolerance;
6792 size_t num_global_parts;
6793 mj_part_t *part_no_array;
6794 int recursion_depth;
6797 mj_lno_t num_local_coords;
6798 mj_gno_t num_global_coords;
6799 const mj_gno_t *initial_mj_gnos;
6800 mj_scalar_t **mj_coordinates;
6802 int num_weights_per_coord;
6803 bool *mj_uniform_weights;
6804 mj_scalar_t **mj_weights;
6805 bool *mj_uniform_parts;
6806 mj_scalar_t **mj_part_sizes;
6814 mj_part_t num_first_level_parts;
6815 const mj_part_t *first_level_distribution;
6817 bool distribute_points_on_cut_lines;
6818 mj_part_t max_concurrent_part_calculation;
6819 int check_migrate_avoid_migration_option;
6822 double minimum_migration_imbalance;
6823 bool mj_keep_part_boxes;
6828 int mj_premigration_option;
6829 int min_coord_per_rank_for_premigration;
6831 ArrayRCP<mj_part_t> comXAdj_;
6832 ArrayRCP<mj_part_t> comAdj_;
6838 ArrayRCP<const mj_scalar_t> * coordinate_ArrayRCP_holder;
6840 void set_up_partitioning_data(
6843 void set_input_parameters(
const Teuchos::ParameterList &p);
6845 void free_work_memory();
6847 RCP<mj_partBoxVector_t> getGlobalBoxBoundaries()
const;
6849 bool mj_premigrate_to_subset(
int used_num_ranks,
int migration_selection_option,
6850 RCP<const Environment> mj_env_,
6851 RCP<
const Comm<int> > mj_problemComm_,
6853 mj_lno_t num_local_coords_,
6854 mj_gno_t num_global_coords_,
size_t num_global_parts_,
6855 const mj_gno_t *initial_mj_gnos_,
6856 mj_scalar_t **mj_coordinates_,
6857 int num_weights_per_coord_,
6858 mj_scalar_t **mj_weights_,
6860 RCP<
const Comm<int> > &result_problemComm_,
6861 mj_lno_t & result_num_local_coords_,
6862 mj_gno_t * &result_initial_mj_gnos_,
6863 mj_scalar_t ** &result_mj_coordinates_,
6864 mj_scalar_t ** &result_mj_weights_,
6865 int * &result_actual_owner_rank_);
6870 RCP<
const Comm<int> > &problemComm,
6871 const RCP<const coordinateModel_t> &coords) :
6872 mj_partitioner(), mj_env(env),
6873 mj_problemComm(problemComm),
6875 imbalance_tolerance(0),
6876 num_global_parts(1),
6877 part_no_array(NULL),
6880 num_local_coords(0),
6881 num_global_coords(0),
6882 initial_mj_gnos(NULL),
6883 mj_coordinates(NULL),
6884 num_weights_per_coord(0),
6885 mj_uniform_weights(NULL),
6887 mj_uniform_parts(NULL),
6888 mj_part_sizes(NULL),
6889 num_first_level_parts(1),
6890 first_level_distribution(NULL),
6891 distribute_points_on_cut_lines(true),
6892 max_concurrent_part_calculation(1),
6893 check_migrate_avoid_migration_option(0),
6895 minimum_migration_imbalance(0.30),
6896 mj_keep_part_boxes(false),
6898 mj_run_as_rcb(false),
6899 mj_premigration_option(0),
6900 min_coord_per_rank_for_premigration(32000),
6901 comXAdj_(), comAdj_(),
6902 coordinate_ArrayRCP_holder(NULL)
6906 if (coordinate_ArrayRCP_holder != NULL){
6907 delete [] this->coordinate_ArrayRCP_holder;
6908 this->coordinate_ArrayRCP_holder = NULL;
6916 const bool bUnsorted =
true;
6917 RCP<Zoltan2::IntegerRangeListValidator<int>> mj_parts_Validator =
6919 pl.set(
"mj_parts",
"0",
"list of parts for multiJagged partitioning "
6920 "algorithm. As many as the dimension count.", mj_parts_Validator);
6922 pl.set(
"mj_concurrent_part_count", 1,
"The number of parts whose cut "
6925 pl.set(
"mj_minimum_migration_imbalance", 1.1,
6926 "mj_minimum_migration_imbalance, the minimum imbalance of the "
6927 "processors to avoid migration",
6930 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_option_validator =
6931 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 2) );
6932 pl.set(
"mj_migration_option", 1,
"Migration option, 0 for decision "
6933 "depending on the imbalance, 1 for forcing migration, 2 for "
6934 "avoiding migration", mj_migration_option_validator);
6939 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_type_validator =
6940 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 1) );
6941 pl.set(
"mj_migration_type", 0,
"Migration type, 0 for migration to minimize the imbalance "
6942 "1 for migration to minimize messages exchanged the migration." ,
6943 mj_migration_option_validator);
6946 pl.set(
"mj_keep_part_boxes",
false,
"Keep the part boundaries of the "
6950 pl.set(
"mj_enable_rcb",
false,
"Use MJ as RCB.",
6953 pl.set(
"mj_recursion_depth", -1,
"Recursion depth for MJ: Must be "
6956 RCP<Teuchos::EnhancedNumberValidator<int>> mj_premigration_option_validator =
6957 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 1024) );
6959 pl.set(
"mj_premigration_option", 0,
"Whether to do premigration or not. 0 for no migration "
6960 "x > 0 for migration to consecutive processors, the subset will be 0,x,2x,3x,...subset ranks."
6961 , mj_premigration_option_validator);
6963 pl.set(
"mj_premigration_coordinate_count", 32000,
"How many coordinate to assign each rank in multijagged after premigration"
6978 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
6982 mj_part_t
pointAssign(
int dim, adapter_scalar_t *point)
const;
6984 void boxAssign(
int dim, adapter_scalar_t *lower, adapter_scalar_t *upper,
6985 size_t &nPartsFound, mj_part_t **partsFound)
const;
6992 ArrayRCP<mj_part_t> &comXAdj,
6993 ArrayRCP<mj_part_t> &comAdj);
6999 template <
typename Adapter>
7000 bool Zoltan2_AlgMJ<Adapter>::mj_premigrate_to_subset(
int used_num_ranks,
7002 RCP<const Environment> mj_env_,
7003 RCP<
const Comm<int> > mj_problemComm_,
7005 mj_lno_t num_local_coords_,
7007 const mj_gno_t *initial_mj_gnos_,
7008 mj_scalar_t **mj_coordinates_,
7009 int num_weights_per_coord_,
7010 mj_scalar_t **mj_weights_,
7012 RCP<
const Comm<int> > &result_problemComm_,
7013 mj_lno_t &result_num_local_coords_,
7014 mj_gno_t * &result_initial_mj_gnos_,
7015 mj_scalar_t ** &result_mj_coordinates_,
7016 mj_scalar_t ** &result_mj_weights_,
7017 int * &result_actual_owner_rank_){
7018 mj_env_->timerStart(
MACRO_TIMERS,
"MultiJagged - PreMigration DistributorPlanCreating");
7021 int myRank = mj_problemComm_->getRank();
7022 int worldSize = mj_problemComm_->getSize();
7024 mj_part_t groupsize = worldSize / used_num_ranks;
7028 std::vector<mj_part_t> group_begins(used_num_ranks + 1, 0);
7030 mj_part_t i_am_sending_to = 0;
7031 bool am_i_a_receiver =
false;
7033 for(
int i = 0; i < used_num_ranks; ++i){
7034 group_begins[i+ 1] = group_begins[i] + groupsize;
7035 if (worldSize % used_num_ranks > i) group_begins[i+ 1] += 1;
7036 if (i == used_num_ranks) group_begins[i+ 1] = worldSize;
7037 if (myRank >= group_begins[i] && myRank < group_begins[i + 1]) i_am_sending_to = group_begins[i];
7038 if (myRank == group_begins[i]) am_i_a_receiver=
true;
7041 ArrayView<const mj_part_t> idView(&(group_begins[0]), used_num_ranks );
7042 result_problemComm_ = mj_problemComm_->createSubcommunicator(idView);
7045 Tpetra::Distributor distributor(mj_problemComm_);
7047 std::vector<mj_part_t> coordinate_destinations(num_local_coords_, i_am_sending_to);
7048 ArrayView<const mj_part_t> destinations( &(coordinate_destinations[0]), num_local_coords_);
7049 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
7050 result_num_local_coords_ = num_incoming_gnos;
7051 mj_env_->timerStop(
MACRO_TIMERS,
"MultiJagged - PreMigration DistributorPlanCreating");
7053 mj_env_->timerStart(
MACRO_TIMERS,
"MultiJagged - PreMigration DistributorMigration");
7057 ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
7059 ArrayView<const mj_gno_t> sent_gnos(initial_mj_gnos_, num_local_coords_);
7060 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
7062 result_initial_mj_gnos_ = allocMemory<mj_gno_t>(num_incoming_gnos);
7064 result_initial_mj_gnos_,
7065 received_gnos.getRawPtr(),
7066 num_incoming_gnos *
sizeof(mj_gno_t));
7070 result_mj_coordinates_ = allocMemory<mj_scalar_t *>(coord_dim_);
7071 for (
int i = 0; i < coord_dim_; ++i){
7072 ArrayView<const mj_scalar_t> sent_coord(mj_coordinates_[i], num_local_coords_);
7073 ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
7074 distributor.doPostsAndWaits<mj_scalar_t>(sent_coord, 1, received_coord());
7075 result_mj_coordinates_[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
7077 result_mj_coordinates_[i],
7078 received_coord.getRawPtr(),
7079 num_incoming_gnos *
sizeof(mj_scalar_t));
7082 result_mj_weights_ = allocMemory<mj_scalar_t *>(num_weights_per_coord_);
7084 for (
int i = 0; i < num_weights_per_coord_; ++i){
7085 ArrayView<const mj_scalar_t> sent_weight(mj_weights_[i], num_local_coords_);
7086 ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
7087 distributor.doPostsAndWaits<mj_scalar_t>(sent_weight, 1, received_weight());
7088 result_mj_weights_[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
7090 result_mj_weights_[i],
7091 received_weight.getRawPtr(),
7092 num_incoming_gnos *
sizeof(mj_scalar_t));
7097 std::vector<int> owner_of_coordinate(num_local_coords_, myRank);
7098 ArrayView<int> sent_owners(&(owner_of_coordinate[0]), num_local_coords_);
7099 ArrayRCP<int> received_owners(num_incoming_gnos);
7100 distributor.doPostsAndWaits<
int>(sent_owners, 1, received_owners());
7101 result_actual_owner_rank_ = allocMemory<int>(num_incoming_gnos);
7103 result_actual_owner_rank_,
7104 received_owners.getRawPtr(),
7105 num_incoming_gnos *
sizeof(int));
7107 mj_env_->timerStop(
MACRO_TIMERS,
"MultiJagged - PreMigration DistributorMigration");
7108 return am_i_a_receiver;
7126 template <
typename Adapter>
7131 this->set_up_partitioning_data(solution);
7132 this->set_input_parameters(this->mj_env->getParameters());
7133 if (this->mj_keep_part_boxes){
7134 this->mj_partitioner.set_to_keep_part_boxes();
7136 this->mj_partitioner.set_partitioning_parameters(
7137 this->distribute_points_on_cut_lines,
7138 this->max_concurrent_part_calculation,
7139 this->check_migrate_avoid_migration_option,
7140 this->minimum_migration_imbalance, this->migration_type);
7143 RCP<const Comm<int> > result_problemComm = this->mj_problemComm;
7144 mj_lno_t result_num_local_coords = this->num_local_coords;
7145 mj_gno_t * result_initial_mj_gnos = NULL;
7146 mj_scalar_t **result_mj_coordinates = this->mj_coordinates;
7147 mj_scalar_t **result_mj_weights = this->mj_weights;
7148 int *result_actual_owner_rank = NULL;
7149 const mj_gno_t * result_initial_mj_gnos_ = this->initial_mj_gnos;
7166 int current_world_size = this->mj_problemComm->getSize();
7167 mj_lno_t threshold_num_local_coords = this->min_coord_per_rank_for_premigration;
7168 bool is_pre_migrated =
false;
7169 bool am_i_in_subset =
true;
7170 if ( mj_premigration_option > 0 &&
7171 size_t (current_world_size) > this->num_global_parts &&
7172 this->num_global_coords < mj_gno_t (current_world_size * threshold_num_local_coords)){
7173 if (this->mj_keep_part_boxes){
7174 throw std::logic_error(
"Multijagged: mj_keep_part_boxes and mj_premigration_option are not supported together yet.");
7176 is_pre_migrated =
true;
7177 int migration_selection_option = mj_premigration_option;
7178 if(migration_selection_option * this->num_global_parts > (
size_t) (current_world_size)){
7179 migration_selection_option = current_world_size / this->num_global_parts;
7181 int used_num_ranks = int (this->num_global_coords /
float (threshold_num_local_coords) + 0.5);
7182 if (used_num_ranks == 0) used_num_ranks = 1;
7184 am_i_in_subset = this->mj_premigrate_to_subset(
7186 migration_selection_option,
7188 this->mj_problemComm,
7190 this->num_local_coords,
7191 this->num_global_coords,
7192 this->num_global_parts,
7193 this->initial_mj_gnos,
7194 this->mj_coordinates,
7195 this->num_weights_per_coord,
7199 result_num_local_coords,
7200 result_initial_mj_gnos,
7201 result_mj_coordinates,
7203 result_actual_owner_rank);
7204 result_initial_mj_gnos_ = result_initial_mj_gnos;
7209 mj_part_t *result_assigned_part_ids = NULL;
7210 mj_gno_t *result_mj_gnos = NULL;
7212 if (am_i_in_subset){
7213 this->mj_partitioner.multi_jagged_part(
7217 this->imbalance_tolerance,
7218 this->num_global_parts,
7219 this->part_no_array,
7220 this->recursion_depth,
7223 result_num_local_coords,
7224 this->num_global_coords,
7225 result_initial_mj_gnos_,
7226 result_mj_coordinates,
7228 this->num_weights_per_coord,
7229 this->mj_uniform_weights,
7231 this->mj_uniform_parts,
7232 this->mj_part_sizes,
7234 result_assigned_part_ids,
7242 #if defined(__cplusplus) && __cplusplus >= 201103L
7243 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid;
7244 localGidToLid.reserve(result_num_local_coords);
7245 for (mj_lno_t i = 0; i < result_num_local_coords; i++)
7246 localGidToLid[result_initial_mj_gnos_[i]] = i;
7247 ArrayRCP<mj_part_t> partId = arcp(
new mj_part_t[result_num_local_coords],
7248 0, result_num_local_coords,
true);
7250 for (mj_lno_t i = 0; i < result_num_local_coords; i++) {
7251 mj_lno_t origLID = localGidToLid[result_mj_gnos[i]];
7252 partId[origLID] = result_assigned_part_ids[i];
7256 Teuchos::Hashtable<mj_gno_t, mj_lno_t>
7257 localGidToLid(result_num_local_coords);
7258 for (mj_lno_t i = 0; i < result_num_local_coords; i++)
7259 localGidToLid.put(result_initial_mj_gnos_[i], i);
7261 ArrayRCP<mj_part_t> partId = arcp(
new mj_part_t[result_num_local_coords],
7262 0, result_num_local_coords,
true);
7264 for (mj_lno_t i = 0; i < result_num_local_coords; i++) {
7265 mj_lno_t origLID = localGidToLid.get(result_mj_gnos[i]);
7266 partId[origLID] = result_assigned_part_ids[i];
7269 #endif // C++11 is enabled
7271 delete [] result_mj_gnos;
7272 delete [] result_assigned_part_ids;
7277 if (is_pre_migrated){
7278 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - PostMigration DistributorPlanCreating");
7279 Tpetra::Distributor distributor(this->mj_problemComm);
7281 ArrayView<const mj_part_t> actual_owner_destinations( result_actual_owner_rank , result_num_local_coords);
7282 mj_lno_t num_incoming_gnos = distributor.createFromSends(actual_owner_destinations);
7283 if (num_incoming_gnos != this->num_local_coords){
7284 throw std::logic_error(
"Zoltan2 - Multijagged Post Migration - num incoming is not equal to num local coords");
7286 mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - PostMigration DistributorPlanCreating");
7287 mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - PostMigration DistributorMigration");
7288 ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
7289 ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
7291 ArrayView<const mj_gno_t> sent_gnos(result_initial_mj_gnos_, result_num_local_coords);
7292 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
7295 ArrayView<mj_part_t> sent_partnos(partId());
7296 distributor.doPostsAndWaits<mj_part_t>(sent_partnos, 1, received_partids());
7298 partId = arcp(
new mj_part_t[this->num_local_coords],
7299 0, this->num_local_coords,
true);
7302 #if defined(__cplusplus) && __cplusplus >= 201103L
7303 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid2;
7304 localGidToLid2.reserve(this->num_local_coords);
7305 for (mj_lno_t i = 0; i < this->num_local_coords; i++)
7306 localGidToLid2[this->initial_mj_gnos[i]] = i;
7309 for (mj_lno_t i = 0; i < this->num_local_coords; i++) {
7310 mj_lno_t origLID = localGidToLid2[received_gnos[i]];
7311 partId[origLID] = received_partids[i];
7315 Teuchos::Hashtable<mj_gno_t, mj_lno_t>
7316 localGidToLid2(this->num_local_coords);
7317 for (mj_lno_t i = 0; i < this->num_local_coords; i++)
7318 localGidToLid2.put(this->initial_mj_gnos[i], i);
7321 for (mj_lno_t i = 0; i < this->num_local_coords; i++) {
7322 mj_lno_t origLID = localGidToLid2.get(received_gnos[i]);
7323 partId[origLID] = received_partids[i];
7326 #endif // C++11 is enabled
7331 freeArray<mj_gno_t> (result_initial_mj_gnos);
7332 for (
int i = 0; i < this->coord_dim; ++i){
7333 freeArray<mj_scalar_t> (result_mj_coordinates[i]);
7335 freeArray<mj_scalar_t *> (result_mj_coordinates);
7337 for (
int i = 0; i < this->num_weights_per_coord; ++i){
7338 freeArray<mj_scalar_t> (result_mj_weights[i]);
7340 freeArray<mj_scalar_t *> (result_mj_weights);
7341 freeArray<int> (result_actual_owner_rank);
7343 mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - PostMigration DistributorMigration");
7347 solution->setParts(partId);
7348 this->free_work_memory();
7353 template <
typename Adapter>
7355 freeArray<mj_scalar_t *>(this->mj_coordinates);
7356 freeArray<mj_scalar_t *>(this->mj_weights);
7357 freeArray<bool>(this->mj_uniform_parts);
7358 freeArray<mj_scalar_t *>(this->mj_part_sizes);
7359 freeArray<bool>(this->mj_uniform_weights);
7365 template <
typename Adapter>
7366 void Zoltan2_AlgMJ<Adapter>::set_up_partitioning_data(
7367 const RCP<PartitioningSolution<Adapter> > &solution
7370 this->coord_dim = this->mj_coords->getCoordinateDim();
7371 this->num_weights_per_coord = this->mj_coords->getNumWeightsPerCoordinate();
7372 this->num_local_coords = this->mj_coords->getLocalNumCoordinates();
7373 this->num_global_coords = this->mj_coords->getGlobalNumCoordinates();
7374 int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
7379 this->num_global_parts = solution->getTargetGlobalNumberOfParts();
7382 this->mj_coordinates = allocMemory<mj_scalar_t *>(this->coord_dim);
7383 this->mj_weights = allocMemory<mj_scalar_t *>(criteria_dim);
7386 this->mj_uniform_parts = allocMemory< bool >(criteria_dim);
7389 this->mj_part_sizes = allocMemory<mj_scalar_t *>(criteria_dim);
7391 this->mj_uniform_weights = allocMemory< bool >(criteria_dim);
7393 typedef StridedData<mj_lno_t, adapter_scalar_t> input_t;
7394 ArrayView<const mj_gno_t> gnos;
7395 ArrayView<input_t> xyz;
7396 ArrayView<input_t> wgts;
7399 this->coordinate_ArrayRCP_holder =
new ArrayRCP<const mj_scalar_t> [this->coord_dim + this->num_weights_per_coord];
7401 this->mj_coords->getCoordinates(gnos, xyz, wgts);
7403 ArrayView<const mj_gno_t> mj_gnos = gnos;
7404 this->initial_mj_gnos = mj_gnos.getRawPtr();
7407 for (
int dim=0; dim < this->coord_dim; dim++){
7408 ArrayRCP<const mj_scalar_t> ar;
7409 xyz[dim].getInputArray(ar);
7411 this->coordinate_ArrayRCP_holder[dim] = ar;
7414 this->mj_coordinates[dim] = (mj_scalar_t *)ar.getRawPtr();
7418 if (this->num_weights_per_coord == 0){
7419 this->mj_uniform_weights[0] =
true;
7420 this->mj_weights[0] = NULL;
7424 for (
int wdim = 0; wdim < this->num_weights_per_coord; wdim++){
7425 ArrayRCP<const mj_scalar_t> ar;
7426 wgts[wdim].getInputArray(ar);
7429 this->coordinate_ArrayRCP_holder[this->coord_dim + wdim] = ar;
7430 this->mj_uniform_weights[wdim] =
false;
7431 this->mj_weights[wdim] = (mj_scalar_t *) ar.getRawPtr();
7435 for (
int wdim = 0; wdim < criteria_dim; wdim++){
7436 if (solution->criteriaHasUniformPartSizes(wdim)){
7437 this->mj_uniform_parts[wdim] =
true;
7438 this->mj_part_sizes[wdim] = NULL;
7441 std::cerr <<
"MJ does not support non uniform target part weights" << std::endl;
7450 template <
typename Adapter>
7451 void Zoltan2_AlgMJ<Adapter>::set_input_parameters(
const Teuchos::ParameterList &pl){
7453 const Teuchos::ParameterEntry *pe = pl.getEntryPtr(
"imbalance_tolerance");
7456 tol = pe->getValue(&tol);
7457 this->imbalance_tolerance = tol - 1.0;
7461 if (this->imbalance_tolerance <= 0)
7462 this->imbalance_tolerance= 10e-4;
7465 this->part_no_array = NULL;
7467 this->recursion_depth = 0;
7469 if (pl.getPtr<Array <mj_part_t> >(
"mj_parts")){
7470 this->part_no_array = (mj_part_t *) pl.getPtr<Array <mj_part_t> >(
"mj_parts")->getRawPtr();
7471 this->recursion_depth = pl.getPtr<Array <mj_part_t> >(
"mj_parts")->size() - 1;
7472 this->mj_env->debug(2,
"mj_parts provided by user");
7476 this->distribute_points_on_cut_lines =
true;
7477 this->max_concurrent_part_calculation = 1;
7479 this->mj_run_as_rcb =
false;
7480 this->mj_premigration_option = 0;
7481 this->min_coord_per_rank_for_premigration = 32000;
7483 int mj_user_recursion_depth = -1;
7484 this->mj_keep_part_boxes =
false;
7485 this->check_migrate_avoid_migration_option = 0;
7486 this->migration_type = 0;
7487 this->minimum_migration_imbalance = 0.35;
7489 pe = pl.getEntryPtr(
"mj_minimum_migration_imbalance");
7492 imb = pe->getValue(&imb);
7493 this->minimum_migration_imbalance = imb - 1.0;
7496 pe = pl.getEntryPtr(
"mj_migration_option");
7498 this->check_migrate_avoid_migration_option = pe->getValue(&this->check_migrate_avoid_migration_option);
7500 this->check_migrate_avoid_migration_option = 0;
7502 if (this->check_migrate_avoid_migration_option > 1) this->check_migrate_avoid_migration_option = -1;
7505 pe = pl.getEntryPtr(
"mj_migration_type");
7507 this->migration_type = pe->getValue(&this->migration_type);
7509 this->migration_type = 0;
7514 pe = pl.getEntryPtr(
"mj_concurrent_part_count");
7516 this->max_concurrent_part_calculation = pe->getValue(&this->max_concurrent_part_calculation);
7518 this->max_concurrent_part_calculation = 1;
7521 pe = pl.getEntryPtr(
"mj_keep_part_boxes");
7523 this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
7525 this->mj_keep_part_boxes =
false;
7537 if (this->mj_keep_part_boxes ==
false){
7538 pe = pl.getEntryPtr(
"mapping_type");
7540 int mapping_type = -1;
7541 mapping_type = pe->getValue(&mapping_type);
7542 if (mapping_type == 0){
7543 mj_keep_part_boxes =
true;
7549 pe = pl.getEntryPtr(
"mj_enable_rcb");
7551 this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
7553 this->mj_run_as_rcb =
false;
7556 pe = pl.getEntryPtr(
"mj_premigration_option");
7558 mj_premigration_option = pe->getValue(&mj_premigration_option);
7560 mj_premigration_option = 0;
7563 pe = pl.getEntryPtr(
"mj_premigration_coordinate_count");
7565 min_coord_per_rank_for_premigration = pe->getValue(&min_coord_per_rank_for_premigration);
7567 min_coord_per_rank_for_premigration = 32000;
7570 pe = pl.getEntryPtr(
"mj_recursion_depth");
7572 mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
7574 mj_user_recursion_depth = -1;
7578 pe = pl.getEntryPtr(
"rectilinear");
7579 if (pe) val = pe->getValue(&val);
7581 this->distribute_points_on_cut_lines =
false;
7583 this->distribute_points_on_cut_lines =
true;
7586 if (this->mj_run_as_rcb){
7587 mj_user_recursion_depth = (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
7589 if (this->recursion_depth < 1){
7590 if (mj_user_recursion_depth > 0){
7591 this->recursion_depth = mj_user_recursion_depth;
7594 this->recursion_depth = this->coord_dim;
7598 this->num_threads = 1;
7599 #ifdef HAVE_ZOLTAN2_OMP
7600 #pragma omp parallel
7602 this->num_threads = omp_get_num_threads();
7609 template <
typename Adapter>
7612 adapter_scalar_t *lower,
7613 adapter_scalar_t *upper,
7614 size_t &nPartsFound,
7624 if (this->mj_keep_part_boxes) {
7627 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
7629 size_t nBoxes = (*partBoxes).size();
7631 throw std::logic_error(
"no part boxes exist");
7635 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
7637 if (globalBox->boxesOverlap(dim, lower, upper)) {
7639 std::vector<typename Adapter::part_t> partlist;
7642 for (
size_t i = 0; i < nBoxes; i++) {
7644 if ((*partBoxes)[i].boxesOverlap(dim, lower, upper)) {
7646 partlist.push_back((*partBoxes)[i].getpId());
7667 *partsFound =
new mj_part_t[nPartsFound];
7668 for (
size_t i = 0; i < nPartsFound; i++)
7669 (*partsFound)[i] = partlist[i];
7682 throw std::logic_error(
"need to use keep_cuts parameter for boxAssign");
7687 template <
typename Adapter>
7690 adapter_scalar_t *point)
const
7697 if (this->mj_keep_part_boxes) {
7701 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
7703 size_t nBoxes = (*partBoxes).size();
7705 throw std::logic_error(
"no part boxes exist");
7709 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
7711 if (globalBox->pointInBox(dim, point)) {
7715 for (i = 0; i < nBoxes; i++) {
7717 if ((*partBoxes)[i].pointInBox(dim, point)) {
7718 foundPart = (*partBoxes)[i].getpId();
7732 std::ostringstream oss;
7734 for (
int j = 0; j < dim; j++) oss << point[j] <<
" ";
7735 oss <<
") not found in domain";
7736 throw std::logic_error(oss.str());
7746 size_t closestBox = 0;
7747 coord_t minDistance = std::numeric_limits<coord_t>::max();
7748 coord_t *centroid =
new coord_t[dim];
7749 for (
size_t i = 0; i < nBoxes; i++) {
7750 (*partBoxes)[i].computeCentroid(centroid);
7753 for (
int j = 0; j < dim; j++) {
7754 diff = centroid[j] - point[j];
7757 if (sum < minDistance) {
7762 foundPart = (*partBoxes)[closestBox].getpId();
7769 throw std::logic_error(
"need to use keep_cuts parameter for pointAssign");
7773 template <
typename Adapter>
7779 if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL){
7780 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
7781 mj_part_t ntasks = (*pBoxes).size();
7782 int dim = (*pBoxes)[0].getDim();
7783 GridHash grid(pBoxes, ntasks, dim);
7791 template <
typename Adapter>
7792 RCP<typename Zoltan2_AlgMJ<Adapter>::mj_partBoxVector_t>
7795 return this->mj_partitioner.get_kept_boxes();
7799 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7801 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBoxVector_t>
7804 if (this->mj_keep_part_boxes)
7805 return this->kept_boxes;
7807 throw std::logic_error(
"Error: part boxes are not stored.");
7810 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7812 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBoxVector_t>
7814 RCP<mj_partBoxVector_t> &localPartBoxes
7818 mj_part_t ntasks = this->num_global_parts;
7819 int dim = (*localPartBoxes)[0].getDim();
7820 coord_t *localPartBoundaries =
new coord_t[ntasks * 2 *dim];
7822 memset(localPartBoundaries, 0,
sizeof(coord_t) * ntasks * 2 *dim);
7824 coord_t *globalPartBoundaries =
new coord_t[ntasks * 2 *dim];
7825 memset(globalPartBoundaries, 0,
sizeof(coord_t) * ntasks * 2 *dim);
7827 coord_t *localPartMins = localPartBoundaries;
7828 coord_t *localPartMaxs = localPartBoundaries + ntasks * dim;
7830 coord_t *globalPartMins = globalPartBoundaries;
7831 coord_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
7833 mj_part_t boxCount = localPartBoxes->size();
7834 for (mj_part_t i = 0; i < boxCount; ++i){
7835 mj_part_t pId = (*localPartBoxes)[i].getpId();
7838 coord_t *lmins = (*localPartBoxes)[i].getlmins();
7839 coord_t *lmaxs = (*localPartBoxes)[i].getlmaxs();
7841 for (
int j = 0; j < dim; ++j){
7842 localPartMins[dim * pId + j] = lmins[j];
7843 localPartMaxs[dim * pId + j] = lmaxs[j];
7855 reduceAll<int, coord_t>(*mj_problemComm, reductionOp,
7856 ntasks * 2 *dim, localPartBoundaries, globalPartBoundaries);
7857 RCP<mj_partBoxVector_t> pB(
new mj_partBoxVector_t(),
true);
7858 for (mj_part_t i = 0; i < ntasks; ++i){
7860 globalPartMaxs + dim * i);
7872 delete []localPartBoundaries;
7873 delete []globalPartBoundaries;
#define MIN_WORK_LAST_DIM
GridHash Class, Hashing Class for part boxes.
Time an algorithm (or other entity) as a whole.
void set(IT index_, CT count_, WT *vals_)
#define Z2_FORWARD_EXCEPTIONS
Forward an exception back through call stack.
Defines Parameter related enumerators, declares functions.
AlgMJ()
Multi Jagged coordinate partitioning algorithm default constructor.
static RCP< Teuchos::BoolParameterEntryValidator > getBoolValidator()
Exists to make setting up validators less cluttered.
void getAdjArrays(ArrayRCP< part_t > &comXAdj_, ArrayRCP< part_t > &comAdj_)
GridHash Class, returns the adj arrays.
Zoltan2_BoxBoundaries(Ordinal s_)
Constructor.
void sequential_task_partitioning(const RCP< const Environment > &env, mj_lno_t num_total_coords, mj_lno_t num_selected_coords, size_t num_target_part, int coord_dim, mj_scalar_t **mj_coordinates, mj_lno_t *initial_selected_coords_output_permutation, mj_lno_t *output_xadj, int recursion_depth, const mj_part_t *part_no_array, bool partition_along_longest_dim, int num_ranks_per_node, bool divide_to_prime_first_, mj_part_t num_first_level_parts_=1, const mj_part_t *first_level_distribution_=NULL)
Special function for partitioning for task mapping. Runs sequential, and performs deterministic parti...
Sort items for quick sort function.
void uqSignsort(IT n, uSignedSortItem< IT, WT, SIGN > *arr)
Quick sort function. Sorts the arr of uSignedSortItems, with respect to increasing vals...
#define imbalanceOf2(Wachieved, wExpected)
RCP< mj_partBoxVector_t > get_kept_boxes() const
void freeArray(T *&array)
Frees the given array.
Class for sorting items with multiple values. First sorting with respect to val[0], then val[1] then ... val[count-1]. The last tie breaking is done with index values. Used for task mapping partitioning where the points on a cut line needs to be distributed consistently.
void multi_jagged_part(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, double imbalance_tolerance, size_t num_global_parts, mj_part_t *part_no_array, int recursion_depth, int coord_dim, mj_lno_t num_local_coords, mj_gno_t num_global_coords, const mj_gno_t *initial_mj_gnos, mj_scalar_t **mj_coordinates, int num_weights_per_coord, bool *mj_uniform_weights, mj_scalar_t **mj_weights, bool *mj_uniform_parts, mj_scalar_t **mj_part_sizes, mj_part_t *&result_assigned_part_ids, mj_gno_t *&result_mj_gnos)
Multi Jagged coordinate partitioning algorithm.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyDoubleValidator()
Exists to make setting up validators less cluttered.
void partition(const RCP< PartitioningSolution< Adapter > > &solution)
Multi Jagged coordinate partitioning algorithm.
void reduce(const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
Implement Teuchos::ValueTypeReductionOp interface.
coordinateModelPartBox Class, represents the boundaries of the box which is a result of a geometric p...
A ParameterList validator for integer range lists.
void set_to_keep_part_boxes()
Function call, if the part boxes are intended to be kept.
void getCommunicationGraph(const PartitioningSolution< Adapter > *solution, ArrayRCP< mj_part_t > &comXAdj, ArrayRCP< mj_part_t > &comAdj)
returns communication graph resulting from MJ partitioning.
bool operator>=(const uSignedSortItem< IT, WT, SIGN > &rhs)
SparseMatrixAdapter_t::part_t part_t
#define FUTURE_REDUCEALL_CUTOFF
Multi Jagged coordinate partitioning algorithm.
This class provides geometric coordinates with optional weights to the Zoltan2 algorithm.
void boxAssign(int dim, adapter_scalar_t *lower, adapter_scalar_t *upper, size_t &nPartsFound, mj_part_t **partsFound) const
T * allocMemory(size_t size)
Allocates memory for the given size.
uMultiSortItem< IT, CT, WT > operator=(const uMultiSortItem< IT, CT, WT > &other)
A PartitioningSolution is a solution to a partitioning problem.
Zoltan2_BoxBoundaries()
Default Constructor.
RCP< mj_partBoxVector_t > compute_global_box_boundaries(RCP< mj_partBoxVector_t > &localPartBoxes) const
uMultiSortItem(const uMultiSortItem< IT, CT, WT > &other)
#define LEAST_SIGNIFICANCE
Zoltan2_BoxBoundaries is a reduction operation to all reduce the all box boundaries.
Algorithm defines the base class for all algorithms.
mj_partBoxVector_t & getPartBoxesView() const
for partitioning methods, return bounding boxes of the
#define Z2_ASSERT_VALUE(actual, expected)
Throw an error when actual value is not equal to expected value.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyIntValidator()
Exists to make setting up validators less cluttered.
RCP< mj_partBox_t > get_global_box() const
Return the global bounding box: min/max coords of global domain.
uMultiSortItem(IT index_, CT count_, WT *vals_)
Define IntegerRangeList validator.
Defines the CoordinateModel classes.
bool operator>(const uSignedSortItem< IT, WT, SIGN > &rhs) const
bool operator>(const uMultiSortItem< IT, CT, WT > &other) const
Tpetra::global_size_t global_size_t
mj_part_t pointAssign(int dim, adapter_scalar_t *point) const
#define Z2_THROW_OUTSIDE_ERROR(env)
Throw an error returned from outside the Zoltan2 library.
#define ZOLTAN2_ALGMULTIJAGGED_SWAP(a, b, temp)
void set_partitioning_parameters(bool distribute_points_on_cut_lines_, int max_concurrent_part_calculation_, int check_migrate_avoid_migration_option_, double minimum_migration_imbalance_, int migration_type_=0)
Multi Jagged coordinate partitioning algorithm.
A gathering of useful namespace methods.
void uqsort(IT n, uSortItem< IT, WT > *arr)
Quick sort function. Sorts the arr of uSortItems, with respect to increasing vals.
Zoltan2_AlgMJ(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, const RCP< const coordinateModel_t > &coords)
Contains Teuchos redcution operators for the Multi-jagged algorthm.
Multi Jagged coordinate partitioning algorithm.
static void getValidParameters(ParameterList &pl)
Set up validators specific to this algorithm.