49 #ifndef _ZOLTAN2_ALGMultiJagged_HPP_
50 #define _ZOLTAN2_ALGMultiJagged_HPP_
57 #include <Teuchos_StandardParameterEntryValidators.hpp>
59 #include <Tpetra_Distributor.hpp>
60 #include <Teuchos_ParameterList.hpp>
67 #if defined(__cplusplus) && __cplusplus >= 201103L
68 #include <unordered_map>
70 #include <Teuchos_Hashtable.hpp>
71 #endif // C++11 is enabled
73 #ifdef ZOLTAN2_USEZOLTANCOMM
74 #ifdef HAVE_ZOLTAN2_MPI
75 #define ENABLE_ZOLTAN_MIGRATION
76 #include "zoltan_comm_cpp.h"
77 #include "zoltan_types.h"
81 #ifdef HAVE_ZOLTAN2_OMP
85 #define LEAST_SIGNIFICANCE 0.0001
86 #define SIGNIFICANCE_MUL 1000
91 #define FUTURE_REDUCEALL_CUTOFF 1500000
94 #define MIN_WORK_LAST_DIM 1000
99 #define ZOLTAN2_ABS(x) ((x) >= 0 ? (x) : -(x))
101 #define imbalanceOf(Wachieved, totalW, expectedRatio) \
102 (Wachieved) / ((totalW) * (expectedRatio)) - 1
103 #define imbalanceOf2(Wachieved, wExpected) \
104 (Wachieved) / (wExpected) - 1
107 #define ZOLTAN2_ALGMULTIJAGGED_SWAP(a,b,temp) temp=(a);(a)=(b);(b)=temp;
116 template <
typename Ordinal,
typename T>
135 size(s_), _EPSILON (std::numeric_limits<T>::
epsilon()){}
139 void reduce(
const Ordinal count,
const T inBuffer[], T inoutBuffer[])
const
141 for (Ordinal i=0; i < count; i++){
142 if (
Z2_ABS(inBuffer[i]) > _EPSILON){
143 inoutBuffer[i] = inBuffer[i];
155 template <
typename T>
160 throw "cannot allocate memory";
172 template <
typename T>
188 template <
typename IT,
typename CT,
typename WT>
209 this->
index = index_;
210 this->
count = count_;
226 void set(IT index_ ,CT count_, WT *vals_){
227 this->
index = index_;
228 this->
count = count_;
240 bool operator<(const uMultiSortItem<IT,CT,WT>& other)
const{
241 assert (this->
count == other.count);
242 for(CT i = 0; i < this->
count; ++i){
248 if(this->
val[i] < other.val[i]){
257 return this->
index < other.index;
261 for(CT i = 0; i < this->
count; ++i){
267 if(this->
val[i] > other.
val[i]){
284 template <
class IT,
class WT>
295 template <
class IT,
class WT>
301 IT i, ir=n, j, k, l=1;
302 IT jstack=0, istack[50];
311 for (j=l+1;j<=ir;j++)
317 if (arr[i].val <= aval)
333 if (arr[l+1].val > arr[ir].val)
337 if (arr[l].val > arr[ir].val)
341 if (arr[l+1].val > arr[l].val)
351 do i++;
while (arr[i].val < aval);
352 do j--;
while (arr[j].val > aval);
359 if (jstack > NSTACK){
360 std::cout <<
"uqsort: NSTACK too small in sort." << std::endl;
379 template <
class IT,
class WT,
class SIGN>
386 bool operator<(const uSignedSortItem<IT, WT, SIGN>& rhs)
const {
388 if (this->
signbit < rhs.signbit){
392 else if (this->
signbit == rhs.signbit){
394 if (this->
val < rhs.val){
398 else if (this->
val > rhs.val){
424 else if (this->
val > rhs.
val){
437 bool operator<=(const uSignedSortItem<IT, WT, SIGN>& rhs){
438 return !(*
this > rhs);}
440 return !(*
this < rhs);}
446 template <
class IT,
class WT,
class SIGN>
451 IT i, ir=n, j, k, l=1;
452 IT jstack=0, istack[50];
460 for (j=l+1;j<=ir;j++)
482 if (arr[l+1] > arr[ir])
486 if (arr[l] > arr[ir])
490 if (arr[l+1] > arr[l])
499 do i++;
while (arr[i] < a);
500 do j--;
while (arr[j] > a);
507 if (jstack > NSTACK){
508 std::cout <<
"uqsort: NSTACK too small in sort." << std::endl;
530 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
536 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
538 RCP<const Environment> mj_env;
539 RCP<const Comm<int> > mj_problemComm;
541 double imbalance_tolerance;
542 mj_part_t *part_no_array;
544 int coord_dim, num_weights_per_coord;
546 size_t initial_num_loc_coords;
549 mj_lno_t num_local_coords;
550 mj_gno_t num_global_coords;
552 mj_scalar_t **mj_coordinates;
553 mj_scalar_t **mj_weights;
554 bool *mj_uniform_parts;
555 mj_scalar_t **mj_part_sizes;
556 bool *mj_uniform_weights;
558 ArrayView<const mj_gno_t> mj_gnos;
559 size_t num_global_parts;
561 mj_gno_t *initial_mj_gnos;
562 mj_gno_t *current_mj_gnos;
563 int *owner_of_coordinate;
565 mj_lno_t *coordinate_permutations;
566 mj_lno_t *new_coordinate_permutations;
567 mj_part_t *assigned_part_ids;
570 mj_lno_t *new_part_xadj;
573 bool distribute_points_on_cut_lines;
574 mj_part_t max_concurrent_part_calculation;
577 int mj_user_recursion_depth;
578 bool mj_keep_part_boxes;
580 int check_migrate_avoid_migration_option;
583 double minimum_migration_imbalance;
586 mj_part_t total_num_cut ;
587 mj_part_t total_num_part;
589 mj_part_t max_num_part_along_dim ;
590 mj_part_t max_num_cut_along_dim;
591 size_t max_num_total_part_along_dim;
593 mj_part_t total_dim_num_reduce_all;
594 mj_part_t last_dim_num_part;
598 RCP<Comm<int> > comm;
600 mj_scalar_t sEpsilon;
602 mj_scalar_t maxScalar_t;
603 mj_scalar_t minScalar_t;
605 mj_scalar_t *all_cut_coordinates;
606 mj_scalar_t *max_min_coords;
607 mj_scalar_t *process_cut_line_weight_to_put_left;
608 mj_scalar_t **thread_cut_line_weight_to_put_left;
614 mj_scalar_t *cut_coordinates_work_array;
617 mj_scalar_t *target_part_weights;
619 mj_scalar_t *cut_upper_bound_coordinates ;
620 mj_scalar_t *cut_lower_bound_coordinates ;
621 mj_scalar_t *cut_lower_bound_weights ;
622 mj_scalar_t *cut_upper_bound_weights ;
624 mj_scalar_t *process_local_min_max_coord_total_weight ;
625 mj_scalar_t *global_min_max_coord_total_weight ;
629 bool *is_cut_line_determined;
632 mj_part_t *my_incomplete_cut_count;
634 double **thread_part_weights;
636 double **thread_part_weight_work;
639 mj_scalar_t **thread_cut_left_closest_point;
641 mj_scalar_t **thread_cut_right_closest_point;
644 mj_lno_t **thread_point_counts;
646 mj_scalar_t *process_rectilinear_cut_weight;
647 mj_scalar_t *global_rectilinear_cut_weight;
653 mj_scalar_t *total_part_weight_left_right_closests ;
654 mj_scalar_t *global_total_part_weight_left_right_closests;
656 RCP<mj_partBoxVector_t> kept_boxes;
659 RCP<mj_partBox_t> global_box;
660 int myRank, myActualRank;
662 bool divide_to_prime_first;
671 void set_part_specifications();
678 inline mj_part_t get_part_count(
679 mj_part_t num_total_future,
685 void allocate_set_work_memory();
692 void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);
695 void compute_global_box();
711 mj_part_t update_part_num_arrays(
712 std::vector<mj_part_t> &num_partitioning_in_current_dim,
713 std::vector<mj_part_t> *future_num_part_in_parts,
714 std::vector<mj_part_t> *next_future_num_parts_in_parts,
715 mj_part_t &future_num_parts,
716 mj_part_t current_num_parts,
717 int current_iteration,
718 RCP<mj_partBoxVector_t> input_part_boxes,
719 RCP<mj_partBoxVector_t> output_part_boxes,
720 mj_part_t atomic_part_count);
733 void mj_get_local_min_max_coord_totW(
734 mj_lno_t coordinate_begin_index,
735 mj_lno_t coordinate_end_index,
736 mj_lno_t *mj_current_coordinate_permutations,
737 mj_scalar_t *mj_current_dim_coords,
738 mj_scalar_t &min_coordinate,
739 mj_scalar_t &max_coordinate,
740 mj_scalar_t &total_weight);
749 void mj_get_global_min_max_coord_totW(
750 mj_part_t current_concurrent_num_parts,
751 mj_scalar_t *local_min_max_total,
752 mj_scalar_t *global_min_max_total);
772 void mj_get_initial_cut_coords_target_weights(
773 mj_scalar_t min_coord,
774 mj_scalar_t max_coord,
776 mj_scalar_t global_weight,
777 mj_scalar_t *initial_cut_coords ,
778 mj_scalar_t *target_part_weights ,
780 std::vector <mj_part_t> *future_num_part_in_parts,
781 std::vector <mj_part_t> *next_future_num_parts_in_parts,
782 mj_part_t concurrent_current_part,
783 mj_part_t obtained_part_index);
797 void set_initial_coordinate_parts(
798 mj_scalar_t &max_coordinate,
799 mj_scalar_t &min_coordinate,
800 mj_part_t &concurrent_current_part_index,
801 mj_lno_t coordinate_begin_index,
802 mj_lno_t coordinate_end_index,
803 mj_lno_t *mj_current_coordinate_permutations,
804 mj_scalar_t *mj_current_dim_coords,
805 mj_part_t *mj_part_ids,
806 mj_part_t &partition_count);
819 mj_scalar_t *mj_current_dim_coords,
820 double imbalanceTolerance,
821 mj_part_t current_work_part,
822 mj_part_t current_concurrent_num_parts,
823 mj_scalar_t *current_cut_coordinates,
824 mj_part_t total_incomplete_cut_count,
825 std::vector <mj_part_t> &num_partitioning_in_current_dim);
846 void mj_1D_part_get_thread_part_weights(
847 size_t total_part_count,
849 mj_scalar_t max_coord,
850 mj_scalar_t min_coord,
851 mj_lno_t coordinate_begin_index,
852 mj_lno_t coordinate_end_index,
853 mj_scalar_t *mj_current_dim_coords,
854 mj_scalar_t *temp_current_cut_coords,
855 bool *current_cut_status,
856 double *my_current_part_weights,
857 mj_scalar_t *my_current_left_closest,
858 mj_scalar_t *my_current_right_closest);
867 void mj_accumulate_thread_results(
868 const std::vector <mj_part_t> &num_partitioning_in_current_dim,
869 mj_part_t current_work_part,
870 mj_part_t current_concurrent_num_parts);
902 void mj_get_new_cut_coordinates(
903 const size_t &num_total_part,
904 const mj_part_t &num_cuts,
905 const mj_scalar_t &max_coordinate,
906 const mj_scalar_t &min_coordinate,
907 const mj_scalar_t &global_total_weight,
908 const double &used_imbalance_tolerance,
909 mj_scalar_t * current_global_part_weights,
910 const mj_scalar_t * current_local_part_weights,
911 const mj_scalar_t *current_part_target_weights,
912 bool *current_cut_line_determined,
913 mj_scalar_t *current_cut_coordinates,
914 mj_scalar_t *current_cut_upper_bounds,
915 mj_scalar_t *current_cut_lower_bounds,
916 mj_scalar_t *current_global_left_closest_points,
917 mj_scalar_t *current_global_right_closest_points,
918 mj_scalar_t * current_cut_lower_bound_weights,
919 mj_scalar_t * current_cut_upper_weights,
920 mj_scalar_t *new_current_cut_coordinates,
921 mj_scalar_t *current_part_cut_line_weight_to_put_left,
922 mj_part_t *rectilinear_cut_count,
923 mj_part_t &my_num_incomplete_cut);
934 void mj_calculate_new_cut_position (
935 mj_scalar_t cut_upper_bound,
936 mj_scalar_t cut_lower_bound,
937 mj_scalar_t cut_upper_weight,
938 mj_scalar_t cut_lower_weight,
939 mj_scalar_t expected_weight,
940 mj_scalar_t &new_cut_position);
952 void mj_create_new_partitions(
954 mj_scalar_t *mj_current_dim_coords,
955 mj_scalar_t *current_concurrent_cut_coordinate,
956 mj_lno_t coordinate_begin,
957 mj_lno_t coordinate_end,
958 mj_scalar_t *used_local_cut_line_weight_to_left,
959 double **used_thread_part_weight_work,
960 mj_lno_t *out_part_xadj);
984 bool mj_perform_migration(
985 mj_part_t in_num_parts,
986 mj_part_t &out_num_parts,
987 std::vector<mj_part_t> *next_future_num_parts_in_parts,
988 mj_part_t &output_part_begin_index,
989 size_t migration_reduce_all_population,
990 mj_lno_t num_coords_for_last_dim_part,
991 std::string iteration,
992 RCP<mj_partBoxVector_t> &input_part_boxes,
993 RCP<mj_partBoxVector_t> &output_part_boxes);
1004 void get_processor_num_points_in_parts(
1005 mj_part_t num_procs,
1006 mj_part_t num_parts,
1007 mj_gno_t *&num_points_in_all_processor_parts);
1021 bool mj_check_to_migrate(
1022 size_t migration_reduce_all_population,
1023 mj_lno_t num_coords_for_last_dim_part,
1024 mj_part_t num_procs,
1025 mj_part_t num_parts,
1026 mj_gno_t *num_points_in_all_processor_parts);
1046 void mj_migration_part_proc_assignment(
1047 mj_gno_t * num_points_in_all_processor_parts,
1048 mj_part_t num_parts,
1049 mj_part_t num_procs,
1050 mj_lno_t *send_count_to_each_proc,
1051 std::vector<mj_part_t> &processor_ranks_for_subcomm,
1052 std::vector<mj_part_t> *next_future_num_parts_in_parts,
1053 mj_part_t &out_num_part,
1054 std::vector<mj_part_t> &out_part_indices,
1055 mj_part_t &output_part_numbering_begin_index,
1056 int *coordinate_destinations);
1074 void mj_assign_proc_to_parts(
1075 mj_gno_t * num_points_in_all_processor_parts,
1076 mj_part_t num_parts,
1077 mj_part_t num_procs,
1078 mj_lno_t *send_count_to_each_proc,
1079 std::vector<mj_part_t> &processor_ranks_for_subcomm,
1080 std::vector<mj_part_t> *next_future_num_parts_in_parts,
1081 mj_part_t &out_part_index,
1082 mj_part_t &output_part_numbering_begin_index,
1083 int *coordinate_destinations);
1095 void assign_send_destinations(
1096 mj_part_t num_parts,
1097 mj_part_t *part_assignment_proc_begin_indices,
1098 mj_part_t *processor_chains_in_parts,
1099 mj_lno_t *send_count_to_each_proc,
1100 int *coordinate_destinations);
1114 void assign_send_destinations2(
1115 mj_part_t num_parts,
1117 int *coordinate_destinations,
1118 mj_part_t &output_part_numbering_begin_index,
1119 std::vector<mj_part_t> *next_future_num_parts_in_parts);
1137 void mj_assign_parts_to_procs(
1138 mj_gno_t * num_points_in_all_processor_parts,
1139 mj_part_t num_parts,
1140 mj_part_t num_procs,
1141 mj_lno_t *send_count_to_each_proc,
1142 std::vector<mj_part_t> *next_future_num_parts_in_parts,
1143 mj_part_t &out_num_part,
1144 std::vector<mj_part_t> &out_part_indices,
1145 mj_part_t &output_part_numbering_begin_index,
1146 int *coordinate_destinations);
1160 void mj_migrate_coords(
1161 mj_part_t num_procs,
1162 mj_lno_t &num_new_local_points,
1163 std::string iteration,
1164 int *coordinate_destinations,
1165 mj_part_t num_parts);
1173 void create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm);
1181 void fill_permutation_array(
1182 mj_part_t output_num_parts,
1183 mj_part_t num_parts);
1193 void set_final_parts(
1194 mj_part_t current_num_parts,
1195 mj_part_t output_part_begin_index,
1196 RCP<mj_partBoxVector_t> &output_part_boxes,
1197 bool is_data_ever_migrated);
1200 void free_work_memory();
1214 void create_consistent_chunks(
1215 mj_part_t num_parts,
1216 mj_scalar_t *mj_current_dim_coords,
1217 mj_scalar_t *current_concurrent_cut_coordinate,
1218 mj_lno_t coordinate_begin,
1219 mj_lno_t coordinate_end,
1220 mj_scalar_t *used_local_cut_line_weight_to_left,
1221 mj_lno_t *out_part_xadj,
1228 mj_part_t find_largest_prime_factor(mj_part_t num_parts){
1229 mj_part_t largest_factor = 1;
1230 mj_part_t n = num_parts;
1231 mj_part_t divisor = 2;
1233 while (n % divisor == 0){
1235 largest_factor = divisor;
1238 if (divisor * divisor > n){
1245 return largest_factor;
1279 const RCP<const Environment> &env,
1280 RCP<
const Comm<int> > &problemComm,
1282 double imbalance_tolerance,
1283 size_t num_global_parts,
1284 mj_part_t *part_no_array,
1285 int recursion_depth,
1288 mj_lno_t num_local_coords,
1289 mj_gno_t num_global_coords,
1290 const mj_gno_t *initial_mj_gnos,
1291 mj_scalar_t **mj_coordinates,
1293 int num_weights_per_coord,
1294 bool *mj_uniform_weights,
1295 mj_scalar_t **mj_weights,
1296 bool *mj_uniform_parts,
1297 mj_scalar_t **mj_part_sizes,
1299 mj_part_t *&result_assigned_part_ids,
1300 mj_gno_t *&result_mj_gnos
1312 bool distribute_points_on_cut_lines_,
1313 int max_concurrent_part_calculation_,
1314 int check_migrate_avoid_migration_option_,
1315 double minimum_migration_imbalance_,
int migration_type_ = 0);
1328 RCP<mj_partBoxVector_t> &localPartBoxes)
const;
1355 const RCP<const Environment> &env,
1356 mj_lno_t num_total_coords,
1357 mj_lno_t num_selected_coords,
1358 size_t num_target_part,
1360 mj_scalar_t **mj_coordinates,
1361 mj_lno_t *initial_selected_coords_output_permutation,
1362 mj_lno_t *output_xadj,
1363 int recursion_depth,
1364 const mj_part_t *part_no_array,
1365 bool partition_along_longest_dim,
1366 int num_ranks_per_node,
1367 bool divide_to_prime_first_);
1395 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1398 const RCP<const Environment> &env,
1399 mj_lno_t num_total_coords,
1400 mj_lno_t num_selected_coords,
1401 size_t num_target_part,
1403 mj_scalar_t **mj_coordinates_,
1404 mj_lno_t *inital_adjList_output_adjlist,
1405 mj_lno_t *output_xadj,
1407 const mj_part_t *part_no_array_,
1408 bool partition_along_longest_dim,
1409 int num_ranks_per_node,
1410 bool divide_to_prime_first_
1415 const RCP<Comm<int> > commN;
1416 this->mj_problemComm =
1417 Teuchos::DefaultComm<int>::getDefaultSerialComm(commN);
1419 Teuchos::rcp_const_cast<Comm<int> >(this->mj_problemComm);
1420 this->myActualRank = this->myRank = 1;
1422 #ifdef HAVE_ZOLTAN2_OMP
1427 this->divide_to_prime_first = divide_to_prime_first_;
1432 this->imbalance_tolerance = 0;
1433 this->num_global_parts = num_target_part;
1434 this->part_no_array = (mj_part_t *)part_no_array_;
1435 this->recursion_depth = rd;
1437 this->coord_dim = coord_dim_;
1438 this->num_local_coords = num_total_coords;
1439 this->num_global_coords = num_total_coords;
1440 this->mj_coordinates = mj_coordinates_;
1444 this->initial_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);
1446 this->num_weights_per_coord = 0;
1447 bool *tmp_mj_uniform_weights =
new bool[1];
1448 this->mj_uniform_weights = tmp_mj_uniform_weights ;
1449 this->mj_uniform_weights[0] =
true;
1451 mj_scalar_t **tmp_mj_weights =
new mj_scalar_t *[1];
1452 this->mj_weights = tmp_mj_weights;
1454 bool *tmp_mj_uniform_parts =
new bool[1];
1455 this->mj_uniform_parts = tmp_mj_uniform_parts;
1456 this->mj_uniform_parts[0] =
true;
1458 mj_scalar_t **tmp_mj_part_sizes =
new mj_scalar_t * [1];
1459 this->mj_part_sizes = tmp_mj_part_sizes;
1460 this->mj_part_sizes[0] = NULL;
1462 this->num_threads = 1;
1463 this->set_part_specifications();
1465 this->allocate_set_work_memory();
1467 this->part_xadj[0] =
static_cast<mj_lno_t
>(num_selected_coords);
1468 for(
size_t i = 0; i < static_cast<size_t>(num_total_coords); ++i){
1469 this->coordinate_permutations[i] = inital_adjList_output_adjlist[i];
1472 mj_part_t current_num_parts = 1;
1474 mj_scalar_t *current_cut_coordinates = this->all_cut_coordinates;
1476 mj_part_t future_num_parts = this->total_num_part;
1478 std::vector<mj_part_t> *future_num_part_in_parts =
new std::vector<mj_part_t> ();
1479 std::vector<mj_part_t> *next_future_num_parts_in_parts =
new std::vector<mj_part_t> ();
1480 next_future_num_parts_in_parts->push_back(this->num_global_parts);
1481 RCP<mj_partBoxVector_t> t1;
1482 RCP<mj_partBoxVector_t> t2;
1485 std::vector <uSignedSortItem<int, mj_scalar_t, char> > coord_dimension_range_sorted(this->coord_dim);
1487 std::vector <mj_scalar_t> coord_dim_mins(this->coord_dim);
1488 std::vector <mj_scalar_t> coord_dim_maxs(this->coord_dim);
1490 for (
int i = 0; i < this->recursion_depth; ++i){
1494 std::vector <mj_part_t> num_partitioning_in_current_dim;
1508 std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
1509 future_num_part_in_parts = next_future_num_parts_in_parts;
1510 next_future_num_parts_in_parts = tmpPartVect;
1515 next_future_num_parts_in_parts->clear();
1519 mj_part_t output_part_count_in_dimension =
1520 this->update_part_num_arrays(
1521 num_partitioning_in_current_dim,
1522 future_num_part_in_parts,
1523 next_future_num_parts_in_parts,
1528 t2, num_ranks_per_node);
1533 if(output_part_count_in_dimension == current_num_parts) {
1534 tmpPartVect= future_num_part_in_parts;
1535 future_num_part_in_parts = next_future_num_parts_in_parts;
1536 next_future_num_parts_in_parts = tmpPartVect;
1541 std::string istring = Teuchos::toString<int>(i);
1545 this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);
1548 mj_part_t output_part_index = 0;
1551 mj_part_t output_coordinate_end_index = 0;
1553 mj_part_t current_work_part = 0;
1554 mj_part_t current_concurrent_num_parts = 1;
1556 mj_part_t obtained_part_index = 0;
1559 int coordInd = i % this->coord_dim;
1560 mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
1564 for (; current_work_part < current_num_parts;
1565 current_work_part += current_concurrent_num_parts){
1571 mj_part_t actual_work_part_count = 0;
1575 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
1576 mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;
1580 if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
1583 ++actual_work_part_count;
1584 mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
1585 mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts==0 ? 0: this->part_xadj[current_work_part_in_concurrent_parts -1];
1595 if(partition_along_longest_dim){
1597 mj_scalar_t best_weight_coord = 0;
1598 for (
int coord_traverse_ind = 0; coord_traverse_ind < this->coord_dim; ++coord_traverse_ind){
1599 mj_scalar_t best_min_coord = 0;
1600 mj_scalar_t best_max_coord = 0;
1603 this->mj_get_local_min_max_coord_totW(
1604 coordinate_begin_index,
1605 coordinate_end_index,
1606 this->coordinate_permutations,
1607 this->mj_coordinates[coord_traverse_ind],
1613 coord_dim_mins[coord_traverse_ind] = best_min_coord;
1614 coord_dim_maxs[coord_traverse_ind] = best_max_coord;
1615 mj_scalar_t best_range = best_max_coord - best_min_coord;
1616 coord_dimension_range_sorted[coord_traverse_ind].id = coord_traverse_ind;
1617 coord_dimension_range_sorted[coord_traverse_ind].val = best_range;
1618 coord_dimension_range_sorted[coord_traverse_ind].signbit = 1;
1622 uqSignsort(this->coord_dim, p_coord_dimension_range_sorted);
1623 coordInd = p_coord_dimension_range_sorted[this->coord_dim - 1].
id;
1634 mj_current_dim_coords = this->mj_coordinates[coordInd];
1636 this->process_local_min_max_coord_total_weight[kk] = coord_dim_mins[coordInd];
1637 this->process_local_min_max_coord_total_weight[kk+ current_concurrent_num_parts] = coord_dim_maxs[coordInd];
1638 this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts] = best_weight_coord;
1642 this->mj_get_local_min_max_coord_totW(
1643 coordinate_begin_index,
1644 coordinate_end_index,
1645 this->coordinate_permutations,
1646 mj_current_dim_coords,
1647 this->process_local_min_max_coord_total_weight[kk],
1648 this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts],
1649 this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts]
1655 if (actual_work_part_count > 0){
1657 this->mj_get_global_min_max_coord_totW(
1658 current_concurrent_num_parts,
1659 this->process_local_min_max_coord_total_weight,
1660 this->global_min_max_coord_total_weight);
1664 mj_part_t total_incomplete_cut_count = 0;
1669 mj_part_t concurrent_part_cut_shift = 0;
1670 mj_part_t concurrent_part_part_shift = 0;
1673 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
1674 mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
1675 mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
1676 current_concurrent_num_parts];
1677 mj_scalar_t global_total_weight =
1678 this->global_min_max_coord_total_weight[kk +
1679 2 * current_concurrent_num_parts];
1681 mj_part_t concurrent_current_part_index = current_work_part + kk;
1683 mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];
1685 mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
1686 mj_scalar_t *current_target_part_weights = this->target_part_weights +
1687 concurrent_part_part_shift;
1689 concurrent_part_cut_shift += partition_count - 1;
1691 concurrent_part_part_shift += partition_count;
1695 if(partition_count > 1 && min_coordinate <= max_coordinate){
1699 total_incomplete_cut_count += partition_count - 1;
1702 this->my_incomplete_cut_count[kk] = partition_count - 1;
1705 this->mj_get_initial_cut_coords_target_weights(
1708 partition_count - 1,
1709 global_total_weight,
1711 current_target_part_weights,
1712 future_num_part_in_parts,
1713 next_future_num_parts_in_parts,
1714 concurrent_current_part_index,
1715 obtained_part_index);
1717 mj_lno_t coordinate_end_index= this->part_xadj[concurrent_current_part_index];
1718 mj_lno_t coordinate_begin_index = concurrent_current_part_index==0 ? 0: this->part_xadj[concurrent_current_part_index -1];
1721 this->set_initial_coordinate_parts(
1724 concurrent_current_part_index,
1725 coordinate_begin_index, coordinate_end_index,
1726 this->coordinate_permutations,
1727 mj_current_dim_coords,
1728 this->assigned_part_ids,
1734 this->my_incomplete_cut_count[kk] = 0;
1736 obtained_part_index += partition_count;
1740 double used_imbalance = 0;
1745 mj_current_dim_coords,
1748 current_concurrent_num_parts,
1749 current_cut_coordinates,
1750 total_incomplete_cut_count,
1751 num_partitioning_in_current_dim);
1754 obtained_part_index += current_concurrent_num_parts;
1760 mj_part_t output_array_shift = 0;
1761 mj_part_t cut_shift = 0;
1762 size_t tlr_shift = 0;
1763 size_t partweight_array_shift = 0;
1765 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
1766 mj_part_t current_concurrent_work_part = current_work_part + kk;
1767 mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];
1770 if((num_parts != 1 ) && this->global_min_max_coord_total_weight[kk] >
1771 this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {
1773 for(mj_part_t jj = 0; jj < num_parts; ++jj){
1774 this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
1776 cut_shift += num_parts - 1;
1777 tlr_shift += (4 *(num_parts - 1) + 1);
1778 output_array_shift += num_parts;
1779 partweight_array_shift += (2 * (num_parts - 1) + 1);
1783 mj_lno_t coordinate_end = this->part_xadj[current_concurrent_work_part];
1784 mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[current_concurrent_work_part
1786 mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
1787 mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
1790 for(
int ii = 0; ii < this->num_threads; ++ii){
1791 this->thread_part_weight_work[ii] = this->thread_part_weights[ii] + partweight_array_shift;
1796 this->create_consistent_chunks(
1798 mj_current_dim_coords,
1799 current_concurrent_cut_coordinate,
1802 used_local_cut_line_weight_to_left,
1803 this->new_part_xadj + output_part_index + output_array_shift,
1805 partition_along_longest_dim,
1806 p_coord_dimension_range_sorted);
1811 mj_lno_t part_size = coordinate_end - coordinate_begin;
1812 *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
1813 memcpy(this->new_coordinate_permutations + coordinate_begin,
1814 this->coordinate_permutations + coordinate_begin,
1815 part_size *
sizeof(mj_lno_t));
1820 cut_shift += num_parts - 1;
1821 tlr_shift += (4 *(num_parts - 1) + 1);
1822 output_array_shift += num_parts;
1823 partweight_array_shift += (2 * (num_parts - 1) + 1);
1832 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
1833 mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
1834 for (mj_part_t ii = 0;ii < num_parts ; ++ii){
1836 this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
1838 mj_lno_t coordinate_end = this->new_part_xadj[output_part_index+ii];
1839 mj_lno_t coordinate_begin = this->new_part_xadj[output_part_index];
1841 for (mj_lno_t task_traverse = coordinate_begin; task_traverse < coordinate_end; ++task_traverse){
1842 mj_lno_t l = this->new_coordinate_permutations[task_traverse];
1844 mj_current_dim_coords[l] = -mj_current_dim_coords[l];
1849 output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
1851 output_part_index += num_parts ;
1858 current_num_parts = output_part_count_in_dimension;
1861 mj_lno_t * tmp = this->coordinate_permutations;
1862 this->coordinate_permutations = this->new_coordinate_permutations;
1863 this->new_coordinate_permutations = tmp;
1865 freeArray<mj_lno_t>(this->part_xadj);
1866 this->part_xadj = this->new_part_xadj;
1867 this->new_part_xadj = NULL;
1870 for(mj_lno_t i = 0; i < num_total_coords; ++i){
1871 inital_adjList_output_adjlist[i] = this->coordinate_permutations[i];
1876 for(
size_t i = 0; i < this->num_global_parts ; ++i){
1877 output_xadj[i+1] = this->part_xadj[i];
1880 delete future_num_part_in_parts;
1881 delete next_future_num_parts_in_parts;
1884 freeArray<mj_part_t>(this->assigned_part_ids);
1885 freeArray<mj_gno_t>(this->initial_mj_gnos);
1886 freeArray<mj_gno_t>(this->current_mj_gnos);
1887 freeArray<bool>(tmp_mj_uniform_weights);
1888 freeArray<bool>(tmp_mj_uniform_parts);
1889 freeArray<mj_scalar_t *>(tmp_mj_weights);
1890 freeArray<mj_scalar_t *>(tmp_mj_part_sizes);
1892 this->free_work_memory();
1894 #ifdef HAVE_ZOLTAN2_OMP
1902 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1905 mj_env(), mj_problemComm(), imbalance_tolerance(0),
1906 part_no_array(NULL), recursion_depth(0), coord_dim(0),
1907 num_weights_per_coord(0), initial_num_loc_coords(0),
1908 initial_num_glob_coords(0),
1909 num_local_coords(0), num_global_coords(0), mj_coordinates(NULL),
1910 mj_weights(NULL), mj_uniform_parts(NULL), mj_part_sizes(NULL),
1911 mj_uniform_weights(NULL), mj_gnos(), num_global_parts(1),
1912 initial_mj_gnos(NULL), current_mj_gnos(NULL), owner_of_coordinate(NULL),
1913 coordinate_permutations(NULL), new_coordinate_permutations(NULL),
1914 assigned_part_ids(NULL), part_xadj(NULL), new_part_xadj(NULL),
1915 distribute_points_on_cut_lines(true), max_concurrent_part_calculation(1),
1916 mj_run_as_rcb(false), mj_user_recursion_depth(0), mj_keep_part_boxes(false),
1917 check_migrate_avoid_migration_option(0), migration_type(0), minimum_migration_imbalance(0.30),
1918 num_threads(1), total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
1919 max_num_cut_along_dim(0), max_num_total_part_along_dim(0), total_dim_num_reduce_all(0),
1920 last_dim_num_part(0), comm(), fEpsilon(0), sEpsilon(0), maxScalar_t(0), minScalar_t(0),
1921 all_cut_coordinates(NULL), max_min_coords(NULL), process_cut_line_weight_to_put_left(NULL),
1922 thread_cut_line_weight_to_put_left(NULL), cut_coordinates_work_array(NULL),
1923 target_part_weights(NULL), cut_upper_bound_coordinates(NULL), cut_lower_bound_coordinates(NULL),
1924 cut_lower_bound_weights(NULL), cut_upper_bound_weights(NULL),
1925 process_local_min_max_coord_total_weight(NULL), global_min_max_coord_total_weight(NULL),
1926 is_cut_line_determined(NULL), my_incomplete_cut_count(NULL),
1927 thread_part_weights(NULL), thread_part_weight_work(NULL),
1928 thread_cut_left_closest_point(NULL), thread_cut_right_closest_point(NULL),
1929 thread_point_counts(NULL), process_rectilinear_cut_weight(NULL),
1930 global_rectilinear_cut_weight(NULL),total_part_weight_left_right_closests(NULL),
1931 global_total_part_weight_left_right_closests(NULL),
1932 kept_boxes(),global_box(),
1933 myRank(0), myActualRank(0), divide_to_prime_first(false)
1938 this->maxScalar_t = std::numeric_limits<mj_scalar_t>::max();
1939 this->minScalar_t = -std::numeric_limits<mj_scalar_t>::max();
1947 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1949 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBox_t>
1952 return this->global_box;
1958 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1961 this->mj_keep_part_boxes =
true;
1972 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1976 this->total_num_cut = 0;
1977 this->total_num_part = 1;
1978 this->max_num_part_along_dim = 0;
1979 this->total_dim_num_reduce_all = 0;
1980 this->last_dim_num_part = 1;
1983 this->max_num_cut_along_dim = 0;
1984 this->max_num_total_part_along_dim = 0;
1986 if (this->part_no_array){
1988 for (
int i = 0; i < this->recursion_depth; ++i){
1989 this->total_dim_num_reduce_all += this->total_num_part;
1990 this->total_num_part *= this->part_no_array[i];
1991 if(this->part_no_array[i] > this->max_num_part_along_dim) {
1992 this->max_num_part_along_dim = this->part_no_array[i];
1995 this->last_dim_num_part = this->total_num_part / this->part_no_array[recursion_depth-1];
1996 this->num_global_parts = this->total_num_part;
1998 mj_part_t future_num_parts = this->num_global_parts;
2001 for (
int i = 0; i < this->recursion_depth; ++i){
2003 mj_part_t maxNoPartAlongI = this->get_part_count(
2004 future_num_parts, 1.0f / (this->recursion_depth - i));
2006 if (maxNoPartAlongI > this->max_num_part_along_dim){
2007 this->max_num_part_along_dim = maxNoPartAlongI;
2010 mj_part_t nfutureNumParts = future_num_parts / maxNoPartAlongI;
2011 if (future_num_parts % maxNoPartAlongI){
2014 future_num_parts = nfutureNumParts;
2016 this->total_num_part = this->num_global_parts;
2018 if (this->divide_to_prime_first){
2019 this->total_dim_num_reduce_all = this->num_global_parts * 2;
2020 this->last_dim_num_part = this->num_global_parts;
2029 for (
int i = 0; i < this->recursion_depth; ++i){
2030 this->total_dim_num_reduce_all += p;
2031 p *= this->max_num_part_along_dim;
2034 if (p / this->max_num_part_along_dim > this->num_global_parts){
2035 this->last_dim_num_part = this->num_global_parts;
2038 this->last_dim_num_part = p / this->max_num_part_along_dim;
2044 this->total_num_cut = this->total_num_part - 1;
2045 this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
2046 this->max_num_total_part_along_dim = this->max_num_part_along_dim + size_t(this->max_num_cut_along_dim);
2050 if(this->max_concurrent_part_calculation > this->last_dim_num_part){
2051 if(this->mj_problemComm->getRank() == 0){
2052 std::cerr <<
"Warning: Concurrent part count ("<< this->max_concurrent_part_calculation <<
2053 ") has been set bigger than maximum amount that can be used." <<
2054 " Setting to:" << this->last_dim_num_part <<
"." << std::endl;
2056 this->max_concurrent_part_calculation = this->last_dim_num_part;
2065 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2067 inline mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::get_part_count(
2068 mj_part_t num_total_future,
2071 double fp = pow(num_total_future, root);
2072 mj_part_t ip = mj_part_t (fp);
2073 if (fp - ip < this->fEpsilon * 100){
2095 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2097 mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::update_part_num_arrays(
2098 std::vector <mj_part_t> &num_partitioning_in_current_dim,
2099 std::vector<mj_part_t> *future_num_part_in_parts,
2100 std::vector<mj_part_t> *next_future_num_parts_in_parts,
2101 mj_part_t &future_num_parts,
2102 mj_part_t current_num_parts,
2103 int current_iteration,
2104 RCP<mj_partBoxVector_t> input_part_boxes,
2105 RCP<mj_partBoxVector_t> output_part_boxes,
2106 mj_part_t atomic_part_count
2109 mj_part_t output_num_parts = 0;
2110 if(this->part_no_array){
2115 mj_part_t p = this->part_no_array[current_iteration];
2117 std::cout <<
"i:" << current_iteration <<
" p is given as:" << p << std::endl;
2121 return current_num_parts;
2124 for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
2125 num_partitioning_in_current_dim.push_back(p);
2138 future_num_parts /= num_partitioning_in_current_dim[0];
2139 output_num_parts = current_num_parts * num_partitioning_in_current_dim[0];
2141 if (this->mj_keep_part_boxes){
2142 for (mj_part_t k = 0; k < current_num_parts; ++k){
2144 for (mj_part_t j = 0; j < num_partitioning_in_current_dim[0]; ++j){
2145 output_part_boxes->push_back((*input_part_boxes)[k]);
2153 for (mj_part_t ii = 0; ii < output_num_parts; ++ii){
2154 next_future_num_parts_in_parts->push_back(future_num_parts);
2164 future_num_parts = 1;
2168 for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
2170 mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];
2174 mj_part_t num_partitions_in_current_dim =
2175 this->get_part_count(
2176 future_num_parts_of_part_ii,
2177 1.0 / (this->recursion_depth - current_iteration)
2180 if (num_partitions_in_current_dim > this->max_num_part_along_dim){
2181 std::cerr <<
"ERROR: maxPartNo calculation is wrong. num_partitions_in_current_dim: "
2182 << num_partitions_in_current_dim <<
"this->max_num_part_along_dim:"
2183 << this->max_num_part_along_dim <<
2184 " this->recursion_depth:" << this->recursion_depth <<
2185 " current_iteration:" << current_iteration <<
2186 " future_num_parts_of_part_ii:" << future_num_parts_of_part_ii <<
2187 " might need to fix max part no calculation for largest_prime_first partitioning" <<
2192 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2194 mj_part_t largest_prime_factor = num_partitions_in_current_dim;
2195 if (this->divide_to_prime_first){
2198 output_num_parts += num_partitions_in_current_dim;
2199 if (future_num_parts_of_part_ii == atomic_part_count || future_num_parts_of_part_ii % atomic_part_count != 0){
2200 atomic_part_count = 1;
2203 largest_prime_factor = this->find_largest_prime_factor(future_num_parts_of_part_ii / atomic_part_count);
2208 if (largest_prime_factor < num_partitions_in_current_dim){
2209 largest_prime_factor = num_partitions_in_current_dim;
2213 mj_part_t ideal_num_future_parts_in_part = (future_num_parts_of_part_ii / atomic_part_count) / largest_prime_factor;
2215 mj_part_t ideal_prime_scale = largest_prime_factor / num_partitions_in_current_dim;
2218 for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii){
2220 mj_part_t my_ideal_primescale = ideal_prime_scale;
2222 if (iii < (largest_prime_factor) % num_partitions_in_current_dim){
2223 ++my_ideal_primescale;
2226 mj_part_t num_future_parts_for_part_iii = ideal_num_future_parts_in_part * my_ideal_primescale;
2229 if (iii < (future_num_parts_of_part_ii / atomic_part_count) % largest_prime_factor){
2231 ++num_future_parts_for_part_iii;
2234 next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);
2237 if (this->mj_keep_part_boxes){
2238 output_part_boxes->push_back((*input_part_boxes)[ii]);
2242 if (num_future_parts_for_part_iii > future_num_parts) future_num_parts = num_future_parts_for_part_iii;
2251 output_num_parts += num_partitions_in_current_dim;
2255 if (future_num_parts_of_part_ii == atomic_part_count || future_num_parts_of_part_ii % atomic_part_count != 0){
2256 atomic_part_count = 1;
2259 mj_part_t ideal_num_future_parts_in_part = (future_num_parts_of_part_ii / atomic_part_count) / num_partitions_in_current_dim;
2260 for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii){
2261 mj_part_t num_future_parts_for_part_iii = ideal_num_future_parts_in_part;
2264 if (iii < (future_num_parts_of_part_ii / atomic_part_count) % num_partitions_in_current_dim){
2266 ++num_future_parts_for_part_iii;
2269 next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);
2272 if (this->mj_keep_part_boxes){
2273 output_part_boxes->push_back((*input_part_boxes)[ii]);
2277 if (num_future_parts_for_part_iii > future_num_parts) future_num_parts = num_future_parts_for_part_iii;
2282 return output_num_parts;
2289 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2291 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::allocate_set_work_memory(){
2294 this->owner_of_coordinate = NULL;
2299 this->coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);
2301 #ifdef HAVE_ZOLTAN2_OMP
2302 #pragma omp parallel for
2304 for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
2305 this->coordinate_permutations[i] = i;
2309 this->new_coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);
2311 this->assigned_part_ids = NULL;
2312 if(this->num_local_coords > 0){
2313 this->assigned_part_ids = allocMemory<mj_part_t>(this->num_local_coords);
2319 this->part_xadj = allocMemory<mj_lno_t>(1);
2320 this->part_xadj[0] =
static_cast<mj_lno_t
>(this->num_local_coords);
2322 this->new_part_xadj = NULL;
2328 this->all_cut_coordinates = allocMemory< mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2330 this->max_min_coords = allocMemory< mj_scalar_t>(this->num_threads * 2);
2332 this->process_cut_line_weight_to_put_left = NULL;
2333 this->thread_cut_line_weight_to_put_left = NULL;
2335 if(this->distribute_points_on_cut_lines){
2336 this->process_cut_line_weight_to_put_left = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2337 this->thread_cut_line_weight_to_put_left = allocMemory<mj_scalar_t *>(this->num_threads);
2338 for(
int i = 0; i < this->num_threads; ++i){
2339 this->thread_cut_line_weight_to_put_left[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2341 this->process_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2342 this->global_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2350 this->cut_coordinates_work_array = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim *
2351 this->max_concurrent_part_calculation);
2355 this->target_part_weights = allocMemory<mj_scalar_t>(
2356 this->max_num_part_along_dim * this->max_concurrent_part_calculation);
2359 this->cut_upper_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2360 this->cut_lower_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2361 this->cut_lower_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2362 this->cut_upper_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2364 this->process_local_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation);
2365 this->global_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation);
2369 this->is_cut_line_determined = allocMemory<bool>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2372 this->my_incomplete_cut_count = allocMemory<mj_part_t>(this->max_concurrent_part_calculation);
2374 this->thread_part_weights = allocMemory<double *>(this->num_threads);
2376 this->thread_part_weight_work = allocMemory<double *>(this->num_threads);
2379 this->thread_cut_left_closest_point = allocMemory<mj_scalar_t *>(this->num_threads);
2381 this->thread_cut_right_closest_point = allocMemory<mj_scalar_t *>(this->num_threads);
2384 this->thread_point_counts = allocMemory<mj_lno_t *>(this->num_threads);
2386 for(
int i = 0; i < this->num_threads; ++i){
2388 this->thread_part_weights[i] = allocMemory < double >(this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
2389 this->thread_cut_right_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2390 this->thread_cut_left_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2391 this->thread_point_counts[i] = allocMemory<mj_lno_t>(this->max_num_part_along_dim);
2397 this->total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2398 this->global_total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2401 mj_scalar_t **coord = allocMemory<mj_scalar_t *>(this->coord_dim);
2402 for (
int i=0; i < this->coord_dim; i++){
2403 coord[i] = allocMemory<mj_scalar_t>(this->num_local_coords);
2404 #ifdef HAVE_ZOLTAN2_OMP
2405 #pragma omp parallel for
2407 for (mj_lno_t j=0; j < this->num_local_coords; j++)
2408 coord[i][j] = this->mj_coordinates[i][j];
2410 this->mj_coordinates = coord;
2413 int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
2414 mj_scalar_t **
weights = allocMemory<mj_scalar_t *>(criteria_dim);
2416 for (
int i=0; i < criteria_dim; i++){
2419 for (
int i=0; i < this->num_weights_per_coord; i++){
2420 weights[i] = allocMemory<mj_scalar_t>(this->num_local_coords);
2421 #ifdef HAVE_ZOLTAN2_OMP
2422 #pragma omp parallel for
2424 for (mj_lno_t j=0; j < this->num_local_coords; j++)
2425 weights[i][j] = this->mj_weights[i][j];
2429 this->current_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);
2430 #ifdef HAVE_ZOLTAN2_OMP
2431 #pragma omp parallel for
2433 for (mj_lno_t j=0; j < this->num_local_coords; j++)
2434 this->current_mj_gnos[j] = this->initial_mj_gnos[j];
2436 this->owner_of_coordinate = allocMemory<int>(this->num_local_coords);
2438 #ifdef HAVE_ZOLTAN2_OMP
2439 #pragma omp parallel for
2441 for (mj_lno_t j=0; j < this->num_local_coords; j++)
2442 this->owner_of_coordinate[j] = this->myActualRank;
2447 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2449 void AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::compute_global_box()
2452 mj_scalar_t *mins = allocMemory<mj_scalar_t>(this->coord_dim);
2454 mj_scalar_t *gmins = allocMemory<mj_scalar_t>(this->coord_dim);
2456 mj_scalar_t *maxs = allocMemory<mj_scalar_t>(this->coord_dim);
2458 mj_scalar_t *gmaxs = allocMemory<mj_scalar_t>(this->coord_dim);
2460 for (
int i = 0; i < this->coord_dim; ++i){
2461 mj_scalar_t localMin = std::numeric_limits<mj_scalar_t>::max();
2462 mj_scalar_t localMax = -localMin;
2463 if (localMax > 0) localMax = 0;
2466 for (mj_lno_t j = 0; j < this->num_local_coords; ++j){
2467 if (this->mj_coordinates[i][j] < localMin){
2468 localMin = this->mj_coordinates[i][j];
2470 if (this->mj_coordinates[i][j] > localMax){
2471 localMax = this->mj_coordinates[i][j];
2480 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
2481 this->coord_dim, mins, gmins
2485 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
2486 this->coord_dim, maxs, gmaxs
2492 global_box = rcp(
new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
2494 freeArray<mj_scalar_t>(mins);
2495 freeArray<mj_scalar_t>(gmins);
2496 freeArray<mj_scalar_t>(maxs);
2497 freeArray<mj_scalar_t>(gmaxs);
2505 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2507 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::init_part_boxes(
2508 RCP<mj_partBoxVector_t> & initial_partitioning_boxes
2511 mj_partBox_t tmp_box(*global_box);
2512 initial_partitioning_boxes->push_back(tmp_box);
2525 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2527 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_local_min_max_coord_totW(
2528 mj_lno_t coordinate_begin_index,
2529 mj_lno_t coordinate_end_index,
2530 mj_lno_t *mj_current_coordinate_permutations,
2531 mj_scalar_t *mj_current_dim_coords,
2532 mj_scalar_t &min_coordinate,
2533 mj_scalar_t &max_coordinate,
2534 mj_scalar_t &total_weight){
2538 if(coordinate_begin_index >= coordinate_end_index)
2540 min_coordinate = this->maxScalar_t;
2541 max_coordinate = this->minScalar_t;
2545 mj_scalar_t my_total_weight = 0;
2546 #ifdef HAVE_ZOLTAN2_OMP
2547 #pragma omp parallel num_threads(this->num_threads)
2551 if (this->mj_uniform_weights[0]) {
2552 #ifdef HAVE_ZOLTAN2_OMP
2556 my_total_weight = coordinate_end_index - coordinate_begin_index;
2562 #ifdef HAVE_ZOLTAN2_OMP
2563 #pragma omp for reduction(+:my_total_weight)
2565 for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2566 int i = mj_current_coordinate_permutations[ii];
2567 my_total_weight += this->mj_weights[0][i];
2571 int my_thread_id = 0;
2572 #ifdef HAVE_ZOLTAN2_OMP
2573 my_thread_id = omp_get_thread_num();
2575 mj_scalar_t my_thread_min_coord, my_thread_max_coord;
2576 my_thread_min_coord=my_thread_max_coord
2577 =mj_current_dim_coords[mj_current_coordinate_permutations[coordinate_begin_index]];
2580 #ifdef HAVE_ZOLTAN2_OMP
2583 for(mj_lno_t j = coordinate_begin_index + 1; j < coordinate_end_index; ++j){
2584 int i = mj_current_coordinate_permutations[j];
2585 if(mj_current_dim_coords[i] > my_thread_max_coord)
2586 my_thread_max_coord = mj_current_dim_coords[i];
2587 if(mj_current_dim_coords[i] < my_thread_min_coord)
2588 my_thread_min_coord = mj_current_dim_coords[i];
2590 this->max_min_coords[my_thread_id] = my_thread_min_coord;
2591 this->max_min_coords[my_thread_id + this->num_threads] = my_thread_max_coord;
2593 #ifdef HAVE_ZOLTAN2_OMP
2596 #pragma omp single nowait
2599 min_coordinate = this->max_min_coords[0];
2600 for(
int i = 1; i < this->num_threads; ++i){
2601 if(this->max_min_coords[i] < min_coordinate)
2602 min_coordinate = this->max_min_coords[i];
2606 #ifdef HAVE_ZOLTAN2_OMP
2607 #pragma omp single nowait
2610 max_coordinate = this->max_min_coords[this->num_threads];
2611 for(
int i = this->num_threads + 1; i < this->num_threads * 2; ++i){
2612 if(this->max_min_coords[i] > max_coordinate)
2613 max_coordinate = this->max_min_coords[i];
2617 total_weight = my_total_weight;
2629 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2631 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_global_min_max_coord_totW(
2632 mj_part_t current_concurrent_num_parts,
2633 mj_scalar_t *local_min_max_total,
2634 mj_scalar_t *global_min_max_total){
2639 if(this->comm->getSize() > 1){
2642 current_concurrent_num_parts,
2643 current_concurrent_num_parts,
2644 current_concurrent_num_parts);
2646 reduceAll<int, mj_scalar_t>(
2649 3 * current_concurrent_num_parts,
2650 local_min_max_total,
2651 global_min_max_total);
2656 mj_part_t s = 3 * current_concurrent_num_parts;
2657 for (mj_part_t i = 0; i < s; ++i){
2658 global_min_max_total[i] = local_min_max_total[i];
2683 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2685 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_initial_cut_coords_target_weights(
2686 mj_scalar_t min_coord,
2687 mj_scalar_t max_coord,
2688 mj_part_t num_cuts ,
2689 mj_scalar_t global_weight,
2690 mj_scalar_t *initial_cut_coords ,
2691 mj_scalar_t *current_target_part_weights ,
2693 std::vector <mj_part_t> *future_num_part_in_parts,
2694 std::vector <mj_part_t> *next_future_num_parts_in_parts,
2695 mj_part_t concurrent_current_part,
2696 mj_part_t obtained_part_index
2699 mj_scalar_t coord_range = max_coord - min_coord;
2700 if(this->mj_uniform_parts[0]){
2702 mj_part_t cumulative = 0;
2704 mj_scalar_t total_future_part_count_in_part = mj_scalar_t((*future_num_part_in_parts)[concurrent_current_part]);
2708 mj_scalar_t unit_part_weight = global_weight / total_future_part_count_in_part;
2714 for(mj_part_t i = 0; i < num_cuts; ++i){
2715 cumulative += (*next_future_num_parts_in_parts)[i + obtained_part_index];
2723 current_target_part_weights[i] = cumulative * unit_part_weight;
2727 initial_cut_coords[i] = min_coord + (coord_range * cumulative) / total_future_part_count_in_part;
2729 current_target_part_weights[num_cuts] = 1;
2733 if (this->mj_uniform_weights[0]){
2734 for(mj_part_t i = 0; i < num_cuts + 1; ++i){
2736 current_target_part_weights[i] = long(current_target_part_weights[i] + 0.5);
2741 std::cerr <<
"MJ does not support non uniform part weights" << std::endl;
2759 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2761 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_initial_coordinate_parts(
2762 mj_scalar_t &max_coordinate,
2763 mj_scalar_t &min_coordinate,
2765 mj_lno_t coordinate_begin_index,
2766 mj_lno_t coordinate_end_index,
2767 mj_lno_t *mj_current_coordinate_permutations,
2768 mj_scalar_t *mj_current_dim_coords,
2769 mj_part_t *mj_part_ids,
2770 mj_part_t &partition_count
2772 mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
2776 if(
ZOLTAN2_ABS(coordinate_range) < this->sEpsilon ){
2777 #ifdef HAVE_ZOLTAN2_OMP
2778 #pragma omp parallel for
2780 for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2781 mj_part_ids[mj_current_coordinate_permutations[ii]] = 0;
2788 mj_scalar_t slice = coordinate_range / partition_count;
2790 #ifdef HAVE_ZOLTAN2_OMP
2791 #pragma omp parallel for
2793 for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2795 mj_lno_t iii = mj_current_coordinate_permutations[ii];
2796 mj_part_t pp = mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
2797 mj_part_ids[iii] = 2 * pp;
2813 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2815 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_1D_part(
2816 mj_scalar_t *mj_current_dim_coords,
2817 double used_imbalance_tolerance,
2818 mj_part_t current_work_part,
2819 mj_part_t current_concurrent_num_parts,
2820 mj_scalar_t *current_cut_coordinates,
2821 mj_part_t total_incomplete_cut_count,
2822 std::vector <mj_part_t> &num_partitioning_in_current_dim
2826 mj_part_t rectilinear_cut_count = 0;
2827 mj_scalar_t *temp_cut_coords = current_cut_coordinates;
2830 *reductionOp = NULL;
2832 <mj_part_t, mj_scalar_t>(
2833 &num_partitioning_in_current_dim ,
2835 current_concurrent_num_parts);
2837 size_t total_reduction_size = 0;
2838 #ifdef HAVE_ZOLTAN2_OMP
2839 #pragma omp parallel shared(total_incomplete_cut_count, rectilinear_cut_count) num_threads(this->num_threads)
2843 #ifdef HAVE_ZOLTAN2_OMP
2844 me = omp_get_thread_num();
2846 double *my_thread_part_weights = this->thread_part_weights[me];
2847 mj_scalar_t *my_thread_left_closest = this->thread_cut_left_closest_point[me];
2848 mj_scalar_t *my_thread_right_closest = this->thread_cut_right_closest_point[me];
2850 #ifdef HAVE_ZOLTAN2_OMP
2856 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
2858 mj_part_t num_part_in_dim = num_partitioning_in_current_dim[current_work_part + i];
2859 mj_part_t num_cut_in_dim = num_part_in_dim - 1;
2860 total_reduction_size += (4 * num_cut_in_dim + 1);
2862 for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii){
2863 this->is_cut_line_determined[next] =
false;
2864 this->cut_lower_bound_coordinates[next] = global_min_max_coord_total_weight[i];
2865 this->cut_upper_bound_coordinates[next] = global_min_max_coord_total_weight[i + current_concurrent_num_parts];
2867 this->cut_upper_bound_weights[next] = global_min_max_coord_total_weight[i + 2 * current_concurrent_num_parts];
2868 this->cut_lower_bound_weights[next] = 0;
2870 if(this->distribute_points_on_cut_lines){
2871 this->process_cut_line_weight_to_put_left[next] = 0;
2882 while (total_incomplete_cut_count != 0){
2884 mj_part_t concurrent_cut_shifts = 0;
2885 size_t total_part_shift = 0;
2887 for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
2888 mj_part_t num_parts = -1;
2889 num_parts = num_partitioning_in_current_dim[current_work_part + kk];
2891 mj_part_t num_cuts = num_parts - 1;
2892 size_t total_part_count = num_parts + size_t (num_cuts) ;
2893 if (this->my_incomplete_cut_count[kk] > 0){
2896 bool *current_cut_status = this->is_cut_line_determined + concurrent_cut_shifts;
2897 double *my_current_part_weights = my_thread_part_weights + total_part_shift;
2898 mj_scalar_t *my_current_left_closest = my_thread_left_closest + concurrent_cut_shifts;
2899 mj_scalar_t *my_current_right_closest = my_thread_right_closest + concurrent_cut_shifts;
2901 mj_part_t conccurent_current_part = current_work_part + kk;
2902 mj_lno_t coordinate_begin_index = conccurent_current_part ==0 ? 0: this->part_xadj[conccurent_current_part -1];
2903 mj_lno_t coordinate_end_index = this->part_xadj[conccurent_current_part];
2904 mj_scalar_t *temp_current_cut_coords = temp_cut_coords + concurrent_cut_shifts;
2906 mj_scalar_t min_coord = global_min_max_coord_total_weight[kk];
2907 mj_scalar_t max_coord = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
2910 this->mj_1D_part_get_thread_part_weights(
2915 coordinate_begin_index,
2916 coordinate_end_index,
2917 mj_current_dim_coords,
2918 temp_current_cut_coords,
2920 my_current_part_weights,
2921 my_current_left_closest,
2922 my_current_right_closest);
2926 concurrent_cut_shifts += num_cuts;
2927 total_part_shift += total_part_count;
2931 this->mj_accumulate_thread_results(
2932 num_partitioning_in_current_dim,
2934 current_concurrent_num_parts);
2937 #ifdef HAVE_ZOLTAN2_OMP
2941 if(this->comm->getSize() > 1){
2942 reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
2943 total_reduction_size,
2944 this->total_part_weight_left_right_closests,
2945 this->global_total_part_weight_left_right_closests);
2950 this->global_total_part_weight_left_right_closests,
2951 this->total_part_weight_left_right_closests,
2952 total_reduction_size *
sizeof(mj_scalar_t));
2957 mj_part_t cut_shift = 0;
2960 size_t tlr_shift = 0;
2961 for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
2962 mj_part_t num_parts = num_partitioning_in_current_dim[current_work_part + kk];
2963 mj_part_t num_cuts = num_parts - 1;
2964 size_t num_total_part = num_parts + size_t (num_cuts) ;
2969 if (this->my_incomplete_cut_count[kk] == 0) {
2970 cut_shift += num_cuts;
2971 tlr_shift += (num_total_part + 2 * num_cuts);
2975 mj_scalar_t *current_local_part_weights = this->total_part_weight_left_right_closests + tlr_shift ;
2976 mj_scalar_t *current_global_tlr = this->global_total_part_weight_left_right_closests + tlr_shift;
2977 mj_scalar_t *current_global_left_closest_points = current_global_tlr + num_total_part;
2978 mj_scalar_t *current_global_right_closest_points = current_global_tlr + num_total_part + num_cuts;
2979 mj_scalar_t *current_global_part_weights = current_global_tlr;
2980 bool *current_cut_line_determined = this->is_cut_line_determined + cut_shift;
2982 mj_scalar_t *current_part_target_weights = this->target_part_weights + cut_shift + kk;
2983 mj_scalar_t *current_part_cut_line_weight_to_put_left = this->process_cut_line_weight_to_put_left + cut_shift;
2985 mj_scalar_t min_coordinate = global_min_max_coord_total_weight[kk];
2986 mj_scalar_t max_coordinate = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
2987 mj_scalar_t global_total_weight = global_min_max_coord_total_weight[kk + current_concurrent_num_parts * 2];
2988 mj_scalar_t *current_cut_lower_bound_weights = this->cut_lower_bound_weights + cut_shift;
2989 mj_scalar_t *current_cut_upper_weights = this->cut_upper_bound_weights + cut_shift;
2990 mj_scalar_t *current_cut_upper_bounds = this->cut_upper_bound_coordinates + cut_shift;
2991 mj_scalar_t *current_cut_lower_bounds = this->cut_lower_bound_coordinates + cut_shift;
2993 mj_part_t initial_incomplete_cut_count = this->my_incomplete_cut_count[kk];
2996 this->mj_get_new_cut_coordinates(
3001 global_total_weight,
3002 used_imbalance_tolerance,
3003 current_global_part_weights,
3004 current_local_part_weights,
3005 current_part_target_weights,
3006 current_cut_line_determined,
3007 temp_cut_coords + cut_shift,
3008 current_cut_upper_bounds,
3009 current_cut_lower_bounds,
3010 current_global_left_closest_points,
3011 current_global_right_closest_points,
3012 current_cut_lower_bound_weights,
3013 current_cut_upper_weights,
3014 this->cut_coordinates_work_array +cut_shift,
3015 current_part_cut_line_weight_to_put_left,
3016 &rectilinear_cut_count,
3017 this->my_incomplete_cut_count[kk]);
3019 cut_shift += num_cuts;
3020 tlr_shift += (num_total_part + 2 * num_cuts);
3021 mj_part_t iteration_complete_cut_count = initial_incomplete_cut_count - this->my_incomplete_cut_count[kk];
3022 #ifdef HAVE_ZOLTAN2_OMP
3026 total_incomplete_cut_count -= iteration_complete_cut_count;
3031 #ifdef HAVE_ZOLTAN2_OMP
3037 mj_scalar_t *t = temp_cut_coords;
3038 temp_cut_coords = this->cut_coordinates_work_array;
3039 this->cut_coordinates_work_array = t;
3050 if (current_cut_coordinates != temp_cut_coords){
3051 #ifdef HAVE_ZOLTAN2_OMP
3056 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3057 mj_part_t num_parts = -1;
3058 num_parts = num_partitioning_in_current_dim[current_work_part + i];
3059 mj_part_t num_cuts = num_parts - 1;
3061 for(mj_part_t ii = 0; ii < num_cuts; ++ii){
3062 current_cut_coordinates[next + ii] = temp_cut_coords[next + ii];
3068 #ifdef HAVE_ZOLTAN2_OMP
3072 this->cut_coordinates_work_array = temp_cut_coords;
3099 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3101 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_1D_part_get_thread_part_weights(
3102 size_t total_part_count,
3104 mj_scalar_t max_coord,
3105 mj_scalar_t min_coord,
3106 mj_lno_t coordinate_begin_index,
3107 mj_lno_t coordinate_end_index,
3108 mj_scalar_t *mj_current_dim_coords,
3109 mj_scalar_t *temp_current_cut_coords,
3111 double *my_current_part_weights,
3112 mj_scalar_t *my_current_left_closest,
3113 mj_scalar_t *my_current_right_closest){
3116 for (
size_t i = 0; i < total_part_count; ++i){
3117 my_current_part_weights[i] = 0;
3122 for(mj_part_t i = 0; i < num_cuts; ++i){
3123 my_current_left_closest[i] = min_coord - 1;
3124 my_current_right_closest[i] = max_coord + 1;
3127 mj_scalar_t minus_EPSILON = -this->sEpsilon;
3128 #ifdef HAVE_ZOLTAN2_OMP
3134 for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
3135 int i = this->coordinate_permutations[ii];
3139 mj_part_t j = this->assigned_part_ids[i] / 2;
3145 mj_part_t lower_cut_index = 0;
3146 mj_part_t upper_cut_index = num_cuts - 1;
3148 mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
3149 bool is_inserted =
false;
3150 bool is_on_left_of_cut =
false;
3151 bool is_on_right_of_cut =
false;
3152 mj_part_t last_compared_part = -1;
3154 mj_scalar_t coord = mj_current_dim_coords[i];
3156 while(upper_cut_index >= lower_cut_index)
3159 last_compared_part = -1;
3160 is_on_left_of_cut =
false;
3161 is_on_right_of_cut =
false;
3162 mj_scalar_t cut = temp_current_cut_coords[j];
3163 mj_scalar_t distance_to_cut = coord - cut;
3164 mj_scalar_t abs_distance_to_cut =
ZOLTAN2_ABS(distance_to_cut);
3167 if(abs_distance_to_cut < this->sEpsilon){
3169 my_current_part_weights[j * 2 + 1] += w;
3170 this->assigned_part_ids[i] = j * 2 + 1;
3173 my_current_left_closest[j] = coord;
3174 my_current_right_closest[j] = coord;
3177 mj_part_t kk = j + 1;
3178 while(kk < num_cuts){
3180 distance_to_cut =
ZOLTAN2_ABS(temp_current_cut_coords[kk] - cut);
3181 if(distance_to_cut < this->sEpsilon){
3182 my_current_part_weights[2 * kk + 1] += w;
3183 my_current_left_closest[kk] = coord;
3184 my_current_right_closest[kk] = coord;
3190 if(coord - my_current_left_closest[kk] > this->sEpsilon){
3191 my_current_left_closest[kk] = coord;
3201 distance_to_cut =
ZOLTAN2_ABS(temp_current_cut_coords[kk] - cut);
3202 if(distance_to_cut < this->sEpsilon){
3203 my_current_part_weights[2 * kk + 1] += w;
3205 this->assigned_part_ids[i] = kk * 2 + 1;
3206 my_current_left_closest[kk] = coord;
3207 my_current_right_closest[kk] = coord;
3213 if(my_current_right_closest[kk] - coord > this->sEpsilon){
3214 my_current_right_closest[kk] = coord;
3225 if (distance_to_cut < 0) {
3226 bool _break =
false;
3230 mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j - 1];
3231 if(distance_to_next_cut > this->sEpsilon){
3237 upper_cut_index = j - 1;
3239 is_on_left_of_cut =
true;
3240 last_compared_part = j;
3245 bool _break =
false;
3246 if(j < num_cuts - 1){
3249 mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j + 1];
3250 if(distance_to_next_cut < minus_EPSILON){
3257 lower_cut_index = j + 1;
3259 is_on_right_of_cut =
true;
3260 last_compared_part = j;
3265 j = (upper_cut_index + lower_cut_index) / 2;
3268 if(is_on_right_of_cut){
3271 my_current_part_weights[2 * last_compared_part + 2] += w;
3272 this->assigned_part_ids[i] = 2 * last_compared_part + 2;
3275 if(my_current_right_closest[last_compared_part] - coord > this->sEpsilon){
3276 my_current_right_closest[last_compared_part] = coord;
3279 if(last_compared_part+1 < num_cuts){
3281 if(coord - my_current_left_closest[last_compared_part + 1] > this->sEpsilon){
3282 my_current_left_closest[last_compared_part + 1] = coord;
3287 else if(is_on_left_of_cut){
3290 my_current_part_weights[2 * last_compared_part] += w;
3291 this->assigned_part_ids[i] = 2 * last_compared_part;
3295 if(coord - my_current_left_closest[last_compared_part] > this->sEpsilon){
3296 my_current_left_closest[last_compared_part] = coord;
3300 if(last_compared_part-1 >= 0){
3301 if(my_current_right_closest[last_compared_part -1] - coord > this->sEpsilon){
3302 my_current_right_closest[last_compared_part -1] = coord;
3311 for (
size_t i = 1; i < total_part_count; ++i){
3315 if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
3316 ZOLTAN2_ABS(temp_current_cut_coords[i / 2] - temp_current_cut_coords[i /2 - 1])
3321 my_current_part_weights[i] = my_current_part_weights[i-2];
3325 my_current_part_weights[i] += my_current_part_weights[i-1];
3337 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3339 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_accumulate_thread_results(
3340 const std::vector <mj_part_t> &num_partitioning_in_current_dim,
3341 mj_part_t current_work_part,
3342 mj_part_t current_concurrent_num_parts){
3344 #ifdef HAVE_ZOLTAN2_OMP
3351 size_t tlr_array_shift = 0;
3352 mj_part_t cut_shift = 0;
3355 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3357 mj_part_t num_parts_in_part = num_partitioning_in_current_dim[current_work_part + i];
3358 mj_part_t num_cuts_in_part = num_parts_in_part - 1;
3359 size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;
3362 for(mj_part_t ii = 0; ii < num_cuts_in_part ; ++ii){
3363 mj_part_t next = tlr_array_shift + ii;
3364 mj_part_t cut_index = cut_shift + ii;
3365 if(this->is_cut_line_determined[cut_index])
continue;
3366 mj_scalar_t left_closest_in_process = this->thread_cut_left_closest_point[0][cut_index],
3367 right_closest_in_process = this->thread_cut_right_closest_point[0][cut_index];
3370 for (
int j = 1; j < this->num_threads; ++j){
3371 if (this->thread_cut_right_closest_point[j][cut_index] < right_closest_in_process ){
3372 right_closest_in_process = this->thread_cut_right_closest_point[j][cut_index];
3374 if (this->thread_cut_left_closest_point[j][cut_index] > left_closest_in_process ){
3375 left_closest_in_process = this->thread_cut_left_closest_point[j][cut_index];
3379 this->total_part_weight_left_right_closests[num_total_part_in_part +
3380 next] = left_closest_in_process;
3381 this->total_part_weight_left_right_closests[num_total_part_in_part +
3382 num_cuts_in_part + next] = right_closest_in_process;
3385 tlr_array_shift += (num_total_part_in_part + 2 * num_cuts_in_part);
3386 cut_shift += num_cuts_in_part;
3389 tlr_array_shift = 0;
3391 size_t total_part_array_shift = 0;
3394 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3396 mj_part_t num_parts_in_part = num_partitioning_in_current_dim[current_work_part + i];
3397 mj_part_t num_cuts_in_part = num_parts_in_part - 1;
3398 size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;
3400 for(
size_t j = 0; j < num_total_part_in_part; ++j){
3402 mj_part_t cut_ind = j / 2 + cut_shift;
3407 if(j != num_total_part_in_part - 1 && this->is_cut_line_determined[cut_ind])
continue;
3409 for (
int k = 0; k < this->num_threads; ++k){
3410 pwj += this->thread_part_weights[k][total_part_array_shift + j];
3413 this->total_part_weight_left_right_closests[tlr_array_shift + j] = pwj;
3415 cut_shift += num_cuts_in_part;
3416 tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
3417 total_part_array_shift += num_total_part_in_part;
3435 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3437 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_calculate_new_cut_position (
3438 mj_scalar_t cut_upper_bound,
3439 mj_scalar_t cut_lower_bound,
3440 mj_scalar_t cut_upper_weight,
3441 mj_scalar_t cut_lower_weight,
3442 mj_scalar_t expected_weight,
3443 mj_scalar_t &new_cut_position){
3445 if(
ZOLTAN2_ABS(cut_upper_bound - cut_lower_bound) < this->sEpsilon){
3446 new_cut_position = cut_upper_bound;
3450 if(
ZOLTAN2_ABS(cut_upper_weight - cut_lower_weight) < this->sEpsilon){
3451 new_cut_position = cut_lower_bound;
3454 mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
3455 mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
3456 mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
3458 mj_scalar_t required_shift = (my_weight_diff / weight_range);
3459 int scale_constant = 20;
3460 int shiftint= int (required_shift * scale_constant);
3461 if (shiftint == 0) shiftint = 1;
3462 required_shift = mj_scalar_t (shiftint) / scale_constant;
3463 new_cut_position = coordinate_range * required_shift + cut_lower_bound;
3477 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3479 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_create_new_partitions(
3480 mj_part_t num_parts,
3482 mj_scalar_t *current_concurrent_cut_coordinate,
3483 mj_lno_t coordinate_begin,
3484 mj_lno_t coordinate_end,
3485 mj_scalar_t *used_local_cut_line_weight_to_left,
3486 double **used_thread_part_weight_work,
3487 mj_lno_t *out_part_xadj){
3489 mj_part_t num_cuts = num_parts - 1;
3491 #ifdef HAVE_ZOLTAN2_OMP
3492 #pragma omp parallel
3496 #ifdef HAVE_ZOLTAN2_OMP
3497 me = omp_get_thread_num();
3500 mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
3501 mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;
3505 if (this->distribute_points_on_cut_lines){
3506 my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
3508 #ifdef HAVE_ZOLTAN2_OMP
3511 for (mj_part_t i = 0; i < num_cuts; ++i){
3513 mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
3514 for(
int ii = 0; ii < this->num_threads; ++ii){
3515 if(left_weight > this->sEpsilon){
3517 mj_scalar_t thread_ii_weight_on_cut = used_thread_part_weight_work[ii][i * 2 + 1] - used_thread_part_weight_work[ii][i * 2 ];
3518 if(thread_ii_weight_on_cut < left_weight){
3520 this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
3524 this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
3526 left_weight -= thread_ii_weight_on_cut;
3529 this->thread_cut_line_weight_to_put_left[ii][i] = 0;
3537 for (mj_part_t i = num_cuts - 1; i > 0 ; --i){
3538 if(
ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
3539 my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
3547 for(mj_part_t ii = 0; ii < num_parts; ++ii){
3548 thread_num_points_in_parts[ii] = 0;
3552 #ifdef HAVE_ZOLTAN2_OMP
3556 for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
3558 mj_lno_t coordinate_index = this->coordinate_permutations[ii];
3559 mj_scalar_t coordinate_weight = this->mj_uniform_weights[0]? 1:this->mj_weights[0][coordinate_index];
3560 mj_part_t coordinate_assigned_place = this->assigned_part_ids[coordinate_index];
3561 mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
3562 if(coordinate_assigned_place % 2 == 1){
3564 if(this->distribute_points_on_cut_lines
3565 && my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] > this->sEpsilon){
3569 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
3574 if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0
3575 && coordinate_assigned_part < num_cuts - 1
3576 &&
ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
3577 current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
3578 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
3580 ++thread_num_points_in_parts[coordinate_assigned_part];
3581 this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3585 ++coordinate_assigned_part;
3587 while(this->distribute_points_on_cut_lines &&
3588 coordinate_assigned_part < num_cuts){
3590 if(
ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part] -
3591 current_concurrent_cut_coordinate[coordinate_assigned_part - 1])
3594 if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >
3596 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >=
3597 ZOLTAN2_ABS(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] - coordinate_weight)){
3598 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
3601 if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0 &&
3602 coordinate_assigned_part < num_cuts - 1 &&
3603 ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
3604 current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
3605 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
3613 ++coordinate_assigned_part;
3615 ++thread_num_points_in_parts[coordinate_assigned_part];
3616 this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3621 ++thread_num_points_in_parts[coordinate_assigned_part];
3622 this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3631 #ifdef HAVE_ZOLTAN2_OMP
3634 for(mj_part_t j = 0; j < num_parts; ++j){
3635 mj_lno_t num_points_in_part_j_upto_thread_i = 0;
3636 for (
int i = 0; i < this->num_threads; ++i){
3637 mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
3639 this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
3640 num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;
3643 out_part_xadj[j] = num_points_in_part_j_upto_thread_i;
3647 #ifdef HAVE_ZOLTAN2_OMP
3652 for(mj_part_t j = 1; j < num_parts; ++j){
3653 out_part_xadj[j] += out_part_xadj[j - 1];
3659 for(mj_part_t j = 1; j < num_parts; ++j){
3660 thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
3666 #ifdef HAVE_ZOLTAN2_OMP
3669 for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
3670 mj_lno_t i = this->coordinate_permutations[ii];
3671 mj_part_t p = this->assigned_part_ids[i];
3672 this->new_coordinate_permutations[coordinate_begin +
3673 thread_num_points_in_parts[p]++] = i;
3708 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3710 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_new_cut_coordinates(
3712 const mj_part_t &num_cuts,
3713 const mj_scalar_t &max_coordinate,
3714 const mj_scalar_t &min_coordinate,
3715 const mj_scalar_t &global_total_weight,
3716 const double &used_imbalance_tolerance,
3717 mj_scalar_t * current_global_part_weights,
3718 const mj_scalar_t * current_local_part_weights,
3719 const mj_scalar_t *current_part_target_weights,
3720 bool *current_cut_line_determined,
3721 mj_scalar_t *current_cut_coordinates,
3722 mj_scalar_t *current_cut_upper_bounds,
3723 mj_scalar_t *current_cut_lower_bounds,
3724 mj_scalar_t *current_global_left_closest_points,
3725 mj_scalar_t *current_global_right_closest_points,
3726 mj_scalar_t * current_cut_lower_bound_weights,
3727 mj_scalar_t * current_cut_upper_weights,
3728 mj_scalar_t *new_current_cut_coordinates,
3729 mj_scalar_t *current_part_cut_line_weight_to_put_left,
3730 mj_part_t *rectilinear_cut_count,
3731 mj_part_t &my_num_incomplete_cut){
3734 mj_scalar_t seen_weight_in_part = 0;
3736 mj_scalar_t expected_weight_in_part = 0;
3738 double imbalance_on_left = 0, imbalance_on_right = 0;
3741 #ifdef HAVE_ZOLTAN2_OMP
3744 for (mj_part_t i = 0; i < num_cuts; i++){
3747 if(min_coordinate - current_global_left_closest_points[i] > this->sEpsilon)
3748 current_global_left_closest_points[i] = current_cut_coordinates[i];
3749 if(current_global_right_closest_points[i] - max_coordinate > this->sEpsilon)
3750 current_global_right_closest_points[i] = current_cut_coordinates[i];
3753 #ifdef HAVE_ZOLTAN2_OMP
3756 for (mj_part_t i = 0; i < num_cuts; i++){
3758 if(this->distribute_points_on_cut_lines){
3760 this->global_rectilinear_cut_weight[i] = 0;
3761 this->process_rectilinear_cut_weight[i] = 0;
3765 if(current_cut_line_determined[i]) {
3766 new_current_cut_coordinates[i] = current_cut_coordinates[i];
3771 seen_weight_in_part = current_global_part_weights[i * 2];
3780 expected_weight_in_part = current_part_target_weights[i];
3782 imbalance_on_left =
imbalanceOf2(seen_weight_in_part, expected_weight_in_part);
3784 imbalance_on_right =
imbalanceOf2(global_total_weight - seen_weight_in_part, global_total_weight - expected_weight_in_part);
3786 bool is_left_imbalance_valid =
ZOLTAN2_ABS(imbalance_on_left) - used_imbalance_tolerance < this->sEpsilon ;
3787 bool is_right_imbalance_valid =
ZOLTAN2_ABS(imbalance_on_right) - used_imbalance_tolerance < this->sEpsilon;
3790 if(is_left_imbalance_valid && is_right_imbalance_valid){
3791 current_cut_line_determined[i] =
true;
3792 #ifdef HAVE_ZOLTAN2_OMP
3795 my_num_incomplete_cut -= 1;
3796 new_current_cut_coordinates [i] = current_cut_coordinates[i];
3799 else if(imbalance_on_left < 0){
3802 if(this->distribute_points_on_cut_lines){
3807 if (current_global_part_weights[i * 2 + 1] == expected_weight_in_part){
3809 current_cut_line_determined[i] =
true;
3810 #ifdef HAVE_ZOLTAN2_OMP
3813 my_num_incomplete_cut -= 1;
3816 new_current_cut_coordinates [i] = current_cut_coordinates[i];
3820 current_part_cut_line_weight_to_put_left[i] = current_local_part_weights[i * 2 + 1] - current_local_part_weights[i * 2];
3823 else if (current_global_part_weights[i * 2 + 1] > expected_weight_in_part){
3827 current_cut_line_determined[i] =
true;
3828 #ifdef HAVE_ZOLTAN2_OMP
3831 *rectilinear_cut_count += 1;
3834 #ifdef HAVE_ZOLTAN2_OMP
3837 my_num_incomplete_cut -= 1;
3838 new_current_cut_coordinates [i] = current_cut_coordinates[i];
3839 this->process_rectilinear_cut_weight[i] = current_local_part_weights[i * 2 + 1] -
3840 current_local_part_weights[i * 2];
3845 current_cut_lower_bounds[i] = current_global_right_closest_points[i];
3847 current_cut_lower_bound_weights[i] = seen_weight_in_part;
3851 for (mj_part_t ii = i + 1; ii < num_cuts ; ++ii){
3852 mj_scalar_t p_weight = current_global_part_weights[ii * 2];
3853 mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];
3855 if(p_weight >= expected_weight_in_part){
3860 if(p_weight == expected_weight_in_part){
3861 current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3862 current_cut_upper_weights[i] = p_weight;
3863 current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3864 current_cut_lower_bound_weights[i] = p_weight;
3865 }
else if (p_weight < current_cut_upper_weights[i]){
3868 current_cut_upper_bounds[i] = current_global_left_closest_points[ii];
3869 current_cut_upper_weights[i] = p_weight;
3875 if(line_weight >= expected_weight_in_part){
3878 current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3879 current_cut_upper_weights[i] = line_weight;
3880 current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3881 current_cut_lower_bound_weights[i] = p_weight;
3886 if (p_weight <= expected_weight_in_part && p_weight >= current_cut_lower_bound_weights[i]){
3887 current_cut_lower_bounds[i] = current_global_right_closest_points[ii] ;
3888 current_cut_lower_bound_weights[i] = p_weight;
3893 mj_scalar_t new_cut_position = 0;
3894 this->mj_calculate_new_cut_position(
3895 current_cut_upper_bounds[i],
3896 current_cut_lower_bounds[i],
3897 current_cut_upper_weights[i],
3898 current_cut_lower_bound_weights[i],
3899 expected_weight_in_part, new_cut_position);
3903 if (
ZOLTAN2_ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
3906 current_cut_line_determined[i] =
true;
3907 #ifdef HAVE_ZOLTAN2_OMP
3910 my_num_incomplete_cut -= 1;
3913 new_current_cut_coordinates [i] = current_cut_coordinates[i];
3915 new_current_cut_coordinates [i] = new_cut_position;
3921 current_cut_upper_bounds[i] = current_global_left_closest_points[i];
3922 current_cut_upper_weights[i] = seen_weight_in_part;
3925 for (
int ii = i - 1; ii >= 0; --ii){
3926 mj_scalar_t p_weight = current_global_part_weights[ii * 2];
3927 mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];
3928 if(p_weight <= expected_weight_in_part){
3929 if(p_weight == expected_weight_in_part){
3932 current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3933 current_cut_upper_weights[i] = p_weight;
3934 current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3935 current_cut_lower_bound_weights[i] = p_weight;
3937 else if (p_weight > current_cut_lower_bound_weights[i]){
3940 current_cut_lower_bounds[i] = current_global_right_closest_points[ii];
3941 current_cut_lower_bound_weights[i] = p_weight;
3947 if(line_weight > expected_weight_in_part){
3948 current_cut_upper_bounds[i] = current_global_right_closest_points[ii];
3949 current_cut_upper_weights[i] = line_weight;
3958 if (p_weight >= expected_weight_in_part &&
3959 (p_weight < current_cut_upper_weights[i] ||
3960 (p_weight == current_cut_upper_weights[i] &&
3961 current_cut_upper_bounds[i] > current_global_left_closest_points[ii]
3965 current_cut_upper_bounds[i] = current_global_left_closest_points[ii] ;
3966 current_cut_upper_weights[i] = p_weight;
3969 mj_scalar_t new_cut_position = 0;
3970 this->mj_calculate_new_cut_position(
3971 current_cut_upper_bounds[i],
3972 current_cut_lower_bounds[i],
3973 current_cut_upper_weights[i],
3974 current_cut_lower_bound_weights[i],
3975 expected_weight_in_part,
3979 if (
ZOLTAN2_ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
3981 current_cut_line_determined[i] =
true;
3982 #ifdef HAVE_ZOLTAN2_OMP
3985 my_num_incomplete_cut -= 1;
3987 new_current_cut_coordinates [ i] = current_cut_coordinates[i];
3989 new_current_cut_coordinates [ i] = new_cut_position;
3998 #ifdef HAVE_ZOLTAN2_OMP
4003 if(*rectilinear_cut_count > 0){
4006 Teuchos::scan<int,mj_scalar_t>(
4007 *comm, Teuchos::REDUCE_SUM,
4009 this->process_rectilinear_cut_weight,
4010 this->global_rectilinear_cut_weight
4015 for (mj_part_t i = 0; i < num_cuts; ++i){
4017 if(this->global_rectilinear_cut_weight[i] > 0) {
4019 mj_scalar_t expected_part_weight = current_part_target_weights[i];
4021 mj_scalar_t necessary_weight_on_line_for_left = expected_part_weight - current_global_part_weights[i * 2];
4023 mj_scalar_t my_weight_on_line = this->process_rectilinear_cut_weight[i];
4025 mj_scalar_t weight_on_line_upto_process_inclusive = this->global_rectilinear_cut_weight[i];
4028 mj_scalar_t space_to_put_left = necessary_weight_on_line_for_left - weight_on_line_upto_process_inclusive;
4030 mj_scalar_t space_left_to_me = space_to_put_left + my_weight_on_line;
4040 if(space_left_to_me < 0){
4042 current_part_cut_line_weight_to_put_left[i] = 0;
4044 else if(space_left_to_me >= my_weight_on_line){
4047 current_part_cut_line_weight_to_put_left[i] = my_weight_on_line;
4052 current_part_cut_line_weight_to_put_left[i] = space_left_to_me ;
4059 *rectilinear_cut_count = 0;
4074 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4076 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::get_processor_num_points_in_parts(
4077 mj_part_t num_procs,
4078 mj_part_t num_parts,
4079 mj_gno_t *&num_points_in_all_processor_parts){
4082 size_t allocation_size = num_parts * (num_procs + 1);
4089 mj_gno_t *num_local_points_in_each_part_to_reduce_sum = allocMemory<mj_gno_t>(allocation_size);
4094 mj_gno_t *my_local_points_to_reduce_sum = num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
4097 mj_gno_t *my_local_point_counts_in_each_art = num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
4100 memset(num_local_points_in_each_part_to_reduce_sum, 0,
sizeof(mj_gno_t)*allocation_size);
4103 for (mj_part_t i = 0; i < num_parts; ++i){
4104 mj_lno_t part_begin_index = 0;
4106 part_begin_index = this->new_part_xadj[i - 1];
4108 mj_lno_t part_end_index = this->new_part_xadj[i];
4109 my_local_points_to_reduce_sum[i] = part_end_index - part_begin_index;
4114 memcpy (my_local_point_counts_in_each_art,
4115 my_local_points_to_reduce_sum,
4116 sizeof(mj_gno_t) * (num_parts) );
4124 reduceAll<int, mj_gno_t>(
4126 Teuchos::REDUCE_SUM,
4128 num_local_points_in_each_part_to_reduce_sum,
4129 num_points_in_all_processor_parts);
4132 freeArray<mj_gno_t>(num_local_points_in_each_part_to_reduce_sum);
4149 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4151 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_check_to_migrate(
4152 size_t migration_reduce_all_population,
4153 mj_lno_t num_coords_for_last_dim_part,
4154 mj_part_t num_procs,
4155 mj_part_t num_parts,
4156 mj_gno_t *num_points_in_all_processor_parts){
4164 if (this->check_migrate_avoid_migration_option == 0){
4165 double global_imbalance = 0;
4167 size_t global_shift = num_procs * num_parts;
4169 for (mj_part_t ii = 0; ii < num_procs; ++ii){
4170 for (mj_part_t i = 0; i < num_parts; ++i){
4171 double ideal_num = num_points_in_all_processor_parts[global_shift + i]
4172 / double(num_procs);
4175 num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num);
4178 global_imbalance /= num_parts;
4179 global_imbalance /= num_procs;
4187 if(global_imbalance <= this->minimum_migration_imbalance){
4210 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4212 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::assign_send_destinations(
4213 mj_part_t num_parts,
4214 mj_part_t *part_assignment_proc_begin_indices,
4215 mj_part_t *processor_chains_in_parts,
4216 mj_lno_t *send_count_to_each_proc,
4217 int *coordinate_destinations){
4219 for (mj_part_t p = 0; p < num_parts; ++p){
4220 mj_lno_t part_begin = 0;
4221 if (p > 0) part_begin = this->new_part_xadj[p - 1];
4222 mj_lno_t part_end = this->new_part_xadj[p];
4225 mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
4227 mj_lno_t num_total_send = 0;
4228 for (mj_lno_t j=part_begin; j < part_end; j++){
4229 mj_lno_t local_ind = this->new_coordinate_permutations[j];
4230 while (num_total_send >= send_count_to_each_proc[proc_to_sent]){
4234 part_assignment_proc_begin_indices[p] = processor_chains_in_parts[proc_to_sent];
4236 processor_chains_in_parts[proc_to_sent] = -1;
4238 proc_to_sent = part_assignment_proc_begin_indices[p];
4241 coordinate_destinations[local_ind] = proc_to_sent;
4261 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4263 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_assign_proc_to_parts(
4264 mj_gno_t * num_points_in_all_processor_parts,
4265 mj_part_t num_parts,
4266 mj_part_t num_procs,
4267 mj_lno_t *send_count_to_each_proc,
4268 std::vector<mj_part_t> &processor_ranks_for_subcomm,
4269 std::vector<mj_part_t> *next_future_num_parts_in_parts,
4270 mj_part_t &out_part_index,
4271 mj_part_t &output_part_numbering_begin_index,
4272 int *coordinate_destinations){
4275 mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
4276 mj_part_t *num_procs_assigned_to_each_part = allocMemory<mj_part_t>(num_parts);
4279 bool did_i_find_my_group =
false;
4281 mj_part_t num_free_procs = num_procs;
4282 mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;
4284 double max_imbalance_difference = 0;
4285 mj_part_t max_differing_part = 0;
4288 for (mj_part_t i=0; i < num_parts; i++){
4291 double scalar_required_proc = num_procs *
4292 (double (global_num_points_in_parts[i]) / double (this->num_global_coords));
4295 mj_part_t required_proc =
static_cast<mj_part_t
> (0.5 + scalar_required_proc);
4296 if (required_proc == 0) required_proc = 1;
4300 if (num_free_procs - required_proc < minimum_num_procs_required_for_rest_of_parts){
4301 required_proc = num_free_procs - (minimum_num_procs_required_for_rest_of_parts);
4305 num_free_procs -= required_proc;
4307 --minimum_num_procs_required_for_rest_of_parts;
4310 num_procs_assigned_to_each_part[i] = required_proc;
4315 double imbalance_wrt_ideal = (scalar_required_proc - required_proc) / required_proc;
4316 if (imbalance_wrt_ideal > max_imbalance_difference){
4317 max_imbalance_difference = imbalance_wrt_ideal;
4318 max_differing_part = i;
4323 if (num_free_procs > 0){
4324 num_procs_assigned_to_each_part[max_differing_part] += num_free_procs;
4331 mj_part_t *part_assignment_proc_begin_indices = allocMemory<mj_part_t>(num_parts);
4333 mj_part_t *processor_chains_in_parts = allocMemory<mj_part_t>(num_procs);
4334 mj_part_t *processor_part_assignments = allocMemory<mj_part_t>(num_procs);
4343 for (
int i = 0; i < num_procs; ++i ){
4344 processor_part_assignments[i] = -1;
4345 processor_chains_in_parts[i] = -1;
4347 for (
int i = 0; i < num_parts; ++i ){
4348 part_assignment_proc_begin_indices[i] = -1;
4354 uSignedSortItem<mj_part_t, mj_gno_t, char> * sort_item_num_part_points_in_procs = allocMemory <uSignedSortItem<mj_part_t, mj_gno_t, char> > (num_procs);
4355 for(mj_part_t i = 0; i < num_parts; ++i){
4359 for(mj_part_t ii = 0; ii < num_procs; ++ii){
4360 sort_item_num_part_points_in_procs[ii].id = ii;
4363 if (processor_part_assignments[ii] == -1){
4364 sort_item_num_part_points_in_procs[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4365 sort_item_num_part_points_in_procs[ii].signbit = 1;
4375 sort_item_num_part_points_in_procs[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4376 sort_item_num_part_points_in_procs[ii].signbit = 0;
4380 uqSignsort<mj_part_t, mj_gno_t,char>(num_procs, sort_item_num_part_points_in_procs);
4390 mj_part_t required_proc_count = num_procs_assigned_to_each_part[i];
4391 mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
4392 mj_gno_t ideal_num_points_in_a_proc =
4393 Teuchos::as<mj_gno_t>(ceil (total_num_points_in_part /
double (required_proc_count)));
4396 mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
4397 mj_part_t next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
4398 mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
4402 for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
4403 mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].id;
4405 processor_part_assignments[proc_id] = i;
4408 bool did_change_sign =
false;
4410 for(mj_part_t ii = 0; ii < num_procs; ++ii){
4413 if (sort_item_num_part_points_in_procs[ii].signbit == 0){
4414 did_change_sign =
true;
4415 sort_item_num_part_points_in_procs[ii].signbit = 1;
4421 if(did_change_sign){
4423 uqSignsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count, sort_item_num_part_points_in_procs);
4435 if (!did_i_find_my_group){
4436 for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
4438 mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].id;
4440 processor_ranks_for_subcomm.push_back(proc_id_to_assign);
4442 if(proc_id_to_assign == this->myRank){
4444 did_i_find_my_group =
true;
4446 part_assignment_proc_begin_indices[i] = this->myRank;
4447 processor_chains_in_parts[this->myRank] = -1;
4450 send_count_to_each_proc[this->myRank] = sort_item_num_part_points_in_procs[ii].val;
4453 for (mj_part_t in = 0; in < i; ++in){
4454 output_part_numbering_begin_index += (*next_future_num_parts_in_parts)[in];
4461 if (!did_i_find_my_group){
4462 processor_ranks_for_subcomm.clear();
4470 for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii){
4471 mj_part_t nonassigned_proc_id = sort_item_num_part_points_in_procs[ii].id;
4472 mj_lno_t num_points_to_sent = sort_item_num_part_points_in_procs[ii].val;
4477 if (num_points_to_sent < 0) {
4478 std::cout <<
"Migration - processor assignments - for part:" << i <<
"from proc:" << nonassigned_proc_id <<
" num_points_to_sent:" << num_points_to_sent << std::endl;
4483 switch (migration_type){
4487 while (num_points_to_sent > 0){
4489 if (num_points_to_sent <= space_left_in_sent_proc){
4491 space_left_in_sent_proc -= num_points_to_sent;
4493 if (this->myRank == nonassigned_proc_id){
4495 send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
4498 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4499 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4500 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4502 num_points_to_sent = 0;
4506 if(space_left_in_sent_proc > 0){
4507 num_points_to_sent -= space_left_in_sent_proc;
4510 if (this->myRank == nonassigned_proc_id){
4512 send_count_to_each_proc[next_proc_to_send_id] = space_left_in_sent_proc;
4513 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4514 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4515 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4520 ++next_proc_to_send_index;
4523 if(next_part_to_send_index < nprocs - required_proc_count ){
4524 std::cout <<
"Migration - processor assignments - for part:"
4526 <<
" next_part_to_send :" << next_part_to_send_index
4527 <<
" nprocs:" << nprocs
4528 <<
" required_proc_count:" << required_proc_count
4529 <<
" Error: next_part_to_send_index < nprocs - required_proc_count" << std::endl;
4535 next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
4537 space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
4546 if (this->myRank == nonassigned_proc_id){
4548 send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
4551 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4552 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4553 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4555 num_points_to_sent = 0;
4556 ++next_proc_to_send_index;
4559 if (next_proc_to_send_index == num_procs){
4560 next_proc_to_send_index = num_procs - required_proc_count;
4563 next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
4565 space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
4578 this->assign_send_destinations(
4580 part_assignment_proc_begin_indices,
4581 processor_chains_in_parts,
4582 send_count_to_each_proc,
4583 coordinate_destinations);
4585 freeArray<mj_part_t>(part_assignment_proc_begin_indices);
4586 freeArray<mj_part_t>(processor_chains_in_parts);
4587 freeArray<mj_part_t>(processor_part_assignments);
4588 freeArray<uSignedSortItem<mj_part_t, mj_gno_t, char> > (sort_item_num_part_points_in_procs);
4589 freeArray<mj_part_t > (num_procs_assigned_to_each_part);
4606 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4608 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::assign_send_destinations2(
4609 mj_part_t num_parts,
4610 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
4611 int *coordinate_destinations,
4612 mj_part_t &output_part_numbering_begin_index,
4613 std::vector<mj_part_t> *next_future_num_parts_in_parts){
4615 mj_part_t part_shift_amount = output_part_numbering_begin_index;
4616 mj_part_t previous_processor = -1;
4617 for(mj_part_t i = 0; i < num_parts; ++i){
4618 mj_part_t p = sort_item_part_to_proc_assignment[i].id;
4620 mj_lno_t part_begin_index = 0;
4621 if (p > 0) part_begin_index = this->new_part_xadj[p - 1];
4622 mj_lno_t part_end_index = this->new_part_xadj[p];
4624 mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].val;
4625 if (this->myRank == assigned_proc && previous_processor != assigned_proc){
4626 output_part_numbering_begin_index = part_shift_amount;
4628 previous_processor = assigned_proc;
4629 part_shift_amount += (*next_future_num_parts_in_parts)[p];
4631 for (mj_lno_t j=part_begin_index; j < part_end_index; j++){
4632 mj_lno_t localInd = this->new_coordinate_permutations[j];
4633 coordinate_destinations[localInd] = assigned_proc;
4655 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4657 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_assign_parts_to_procs(
4658 mj_gno_t * num_points_in_all_processor_parts,
4659 mj_part_t num_parts,
4660 mj_part_t num_procs,
4661 mj_lno_t *send_count_to_each_proc,
4662 std::vector<mj_part_t> *next_future_num_parts_in_parts,
4663 mj_part_t &out_num_part,
4664 std::vector<mj_part_t> &out_part_indices,
4665 mj_part_t &output_part_numbering_begin_index,
4666 int *coordinate_destinations){
4669 mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
4670 out_part_indices.clear();
4674 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment = allocMemory <uSortItem<mj_part_t, mj_part_t> >(num_parts);
4675 uSortItem<mj_part_t, mj_gno_t> * sort_item_num_points_of_proc_in_part_i = allocMemory <uSortItem<mj_part_t, mj_gno_t> >(num_procs);
4679 mj_lno_t work_each = mj_lno_t (this->num_global_coords / (
double (num_procs)) + 0.5f);
4681 mj_lno_t *space_in_each_processor = allocMemory <mj_lno_t>(num_procs);
4683 for (mj_part_t i = 0; i < num_procs; ++i){
4684 space_in_each_processor[i] = work_each;
4691 mj_part_t *num_parts_proc_assigned = allocMemory <mj_part_t>(num_procs);
4692 memset(num_parts_proc_assigned, 0,
sizeof(mj_part_t) * num_procs);
4693 int empty_proc_count = num_procs;
4697 uSortItem<mj_part_t, mj_gno_t> * sort_item_point_counts_in_parts = allocMemory <uSortItem<mj_part_t, mj_gno_t> >(num_parts);
4701 for (mj_part_t i = 0; i < num_parts; ++i){
4702 sort_item_point_counts_in_parts[i].id = i;
4703 sort_item_point_counts_in_parts[i].val = global_num_points_in_parts[i];
4706 uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);
4712 for (mj_part_t j = 0; j < num_parts; ++j){
4714 mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].id;
4716 mj_gno_t load = global_num_points_in_parts[i];
4719 mj_part_t assigned_proc = -1;
4721 mj_part_t best_proc_to_assign = 0;
4725 for (mj_part_t ii = 0; ii < num_procs; ++ii){
4726 sort_item_num_points_of_proc_in_part_i[ii].id = ii;
4731 if (empty_proc_count < num_parts - j || num_parts_proc_assigned[ii] == 0){
4733 sort_item_num_points_of_proc_in_part_i[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4736 sort_item_num_points_of_proc_in_part_i[ii].val = -1;
4739 uqsort<mj_part_t, mj_gno_t>(num_procs, sort_item_num_points_of_proc_in_part_i);
4742 for (mj_part_t iii = num_procs - 1; iii >= 0; --iii){
4743 mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
4744 mj_lno_t left_space = space_in_each_processor[ii] - load;
4746 if(left_space >= 0 ){
4751 if (space_in_each_processor[best_proc_to_assign] < space_in_each_processor[ii]){
4752 best_proc_to_assign = ii;
4757 if (assigned_proc == -1){
4758 assigned_proc = best_proc_to_assign;
4761 if (num_parts_proc_assigned[assigned_proc]++ == 0){
4764 space_in_each_processor[assigned_proc] -= load;
4766 sort_item_part_to_proc_assignment[j].id = i;
4767 sort_item_part_to_proc_assignment[j].val = assigned_proc;
4771 if (assigned_proc == this->myRank){
4773 out_part_indices.push_back(i);
4777 send_count_to_each_proc[assigned_proc] += num_points_in_all_processor_parts[this->myRank * num_parts + i];
4779 freeArray<mj_part_t>(num_parts_proc_assigned);
4780 freeArray< uSortItem<mj_part_t, mj_gno_t> > (sort_item_num_points_of_proc_in_part_i);
4781 freeArray<uSortItem<mj_part_t, mj_gno_t> >(sort_item_point_counts_in_parts);
4782 freeArray<mj_lno_t >(space_in_each_processor);
4786 uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);
4790 this->assign_send_destinations2(
4792 sort_item_part_to_proc_assignment,
4793 coordinate_destinations,
4794 output_part_numbering_begin_index,
4795 next_future_num_parts_in_parts);
4797 freeArray<uSortItem<mj_part_t, mj_part_t> >(sort_item_part_to_proc_assignment);
4818 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4820 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_migration_part_proc_assignment(
4821 mj_gno_t * num_points_in_all_processor_parts,
4822 mj_part_t num_parts,
4823 mj_part_t num_procs,
4824 mj_lno_t *send_count_to_each_proc,
4825 std::vector<mj_part_t> &processor_ranks_for_subcomm,
4826 std::vector<mj_part_t> *next_future_num_parts_in_parts,
4827 mj_part_t &out_num_part,
4828 std::vector<mj_part_t> &out_part_indices,
4829 mj_part_t &output_part_numbering_begin_index,
4830 int *coordinate_destinations){
4834 processor_ranks_for_subcomm.clear();
4836 if (num_procs > num_parts){
4841 mj_part_t out_part_index = 0;
4842 this->mj_assign_proc_to_parts(
4843 num_points_in_all_processor_parts,
4846 send_count_to_each_proc,
4847 processor_ranks_for_subcomm,
4848 next_future_num_parts_in_parts,
4850 output_part_numbering_begin_index,
4851 coordinate_destinations
4855 out_part_indices.clear();
4856 out_part_indices.push_back(out_part_index);
4863 processor_ranks_for_subcomm.push_back(this->myRank);
4867 this->mj_assign_parts_to_procs(
4868 num_points_in_all_processor_parts,
4871 send_count_to_each_proc,
4872 next_future_num_parts_in_parts,
4875 output_part_numbering_begin_index,
4876 coordinate_destinations);
4892 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4894 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_migrate_coords(
4895 mj_part_t num_procs,
4896 mj_lno_t &num_new_local_points,
4897 std::string iteration,
4898 int *coordinate_destinations,
4899 mj_part_t num_parts)
4901 #ifdef ENABLE_ZOLTAN_MIGRATION
4902 if (
sizeof(mj_lno_t) <=
sizeof(
int)) {
4908 ZOLTAN_COMM_OBJ *plan = NULL;
4909 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->comm));
4910 int num_incoming_gnos = 0;
4911 int message_tag = 7859;
4913 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Migration Z1PlanCreating-" + iteration);
4914 int ierr = Zoltan_Comm_Create(
4916 int(this->num_local_coords),
4917 coordinate_destinations,
4920 &num_incoming_gnos);
4922 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Migration Z1PlanCreating-" + iteration);
4924 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Migration Z1Migration-" + iteration);
4925 mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(num_incoming_gnos);
4929 ierr = Zoltan_Comm_Do(
4932 (
char *) this->current_mj_gnos,
4934 (
char *) incoming_gnos);
4937 freeArray<mj_gno_t>(this->current_mj_gnos);
4938 this->current_mj_gnos = incoming_gnos;
4942 for (
int i = 0; i < this->coord_dim; ++i){
4944 mj_scalar_t *coord = this->mj_coordinates[i];
4946 this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4947 ierr = Zoltan_Comm_Do(
4951 sizeof(mj_scalar_t),
4952 (
char *) this->mj_coordinates[i]);
4954 freeArray<mj_scalar_t>(coord);
4958 for (
int i = 0; i < this->num_weights_per_coord; ++i){
4960 mj_scalar_t *weight = this->mj_weights[i];
4962 this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4963 ierr = Zoltan_Comm_Do(
4967 sizeof(mj_scalar_t),
4968 (
char *) this->mj_weights[i]);
4970 freeArray<mj_scalar_t>(weight);
4975 int *coord_own = allocMemory<int>(num_incoming_gnos);
4977 ierr = Zoltan_Comm_Do(
4980 (
char *) this->owner_of_coordinate,
4981 sizeof(
int), (
char *) coord_own);
4983 freeArray<int>(this->owner_of_coordinate);
4984 this->owner_of_coordinate = coord_own;
4990 mj_part_t *new_parts = allocMemory<mj_part_t>(num_incoming_gnos);
4991 if(num_procs < num_parts){
4993 ierr = Zoltan_Comm_Do(
4996 (
char *) this->assigned_part_ids,
4998 (
char *) new_parts);
5001 freeArray<mj_part_t>(this->assigned_part_ids);
5002 this->assigned_part_ids = new_parts;
5004 ierr = Zoltan_Comm_Destroy(&plan);
5006 num_new_local_points = num_incoming_gnos;
5007 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Migration Z1Migration-" + iteration);
5012 #endif // ENABLE_ZOLTAN_MIGRATION
5014 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Migration DistributorPlanCreating-" + iteration);
5015 Tpetra::Distributor distributor(this->comm);
5016 ArrayView<const mj_part_t> destinations( coordinate_destinations, this->num_local_coords);
5017 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
5018 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Migration DistributorPlanCreating-" + iteration);
5020 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Migration DistributorMigration-" + iteration);
5023 ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
5024 ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
5025 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
5026 freeArray<mj_gno_t>(this->current_mj_gnos);
5027 this->current_mj_gnos = allocMemory<mj_gno_t>(num_incoming_gnos);
5029 this->current_mj_gnos,
5030 received_gnos.getRawPtr(),
5031 num_incoming_gnos *
sizeof(mj_gno_t));
5034 for (
int i = 0; i < this->coord_dim; ++i){
5036 ArrayView<mj_scalar_t> sent_coord(this->mj_coordinates[i], this->num_local_coords);
5037 ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
5038 distributor.doPostsAndWaits<mj_scalar_t>(sent_coord, 1, received_coord());
5039 freeArray<mj_scalar_t>(this->mj_coordinates[i]);
5040 this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
5042 this->mj_coordinates[i],
5043 received_coord.getRawPtr(),
5044 num_incoming_gnos *
sizeof(mj_scalar_t));
5048 for (
int i = 0; i < this->num_weights_per_coord; ++i){
5050 ArrayView<mj_scalar_t> sent_weight(this->mj_weights[i], this->num_local_coords);
5051 ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
5052 distributor.doPostsAndWaits<mj_scalar_t>(sent_weight, 1, received_weight());
5053 freeArray<mj_scalar_t>(this->mj_weights[i]);
5054 this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
5056 this->mj_weights[i],
5057 received_weight.getRawPtr(),
5058 num_incoming_gnos *
sizeof(mj_scalar_t));
5063 ArrayView<int> sent_owners(this->owner_of_coordinate, this->num_local_coords);
5064 ArrayRCP<int> received_owners(num_incoming_gnos);
5065 distributor.doPostsAndWaits<
int>(sent_owners, 1, received_owners());
5066 freeArray<int>(this->owner_of_coordinate);
5067 this->owner_of_coordinate = allocMemory<int>(num_incoming_gnos);
5069 this->owner_of_coordinate,
5070 received_owners.getRawPtr(),
5071 num_incoming_gnos *
sizeof(int));
5077 if(num_procs < num_parts){
5078 ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
5079 ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
5080 distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
5081 freeArray<mj_part_t>(this->assigned_part_ids);
5082 this->assigned_part_ids = allocMemory<mj_part_t>(num_incoming_gnos);
5084 this->assigned_part_ids,
5085 received_partids.getRawPtr(),
5086 num_incoming_gnos *
sizeof(mj_part_t));
5089 mj_part_t *new_parts = allocMemory<int>(num_incoming_gnos);
5090 freeArray<mj_part_t>(this->assigned_part_ids);
5091 this->assigned_part_ids = new_parts;
5093 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Migration DistributorMigration-" + iteration);
5094 num_new_local_points = num_incoming_gnos;
5105 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5107 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm){
5108 mj_part_t group_size = processor_ranks_for_subcomm.size();
5109 mj_part_t *ids = allocMemory<mj_part_t>(group_size);
5110 for(mj_part_t i = 0; i < group_size; ++i) {
5111 ids[i] = processor_ranks_for_subcomm[i];
5113 ArrayView<const mj_part_t> idView(ids, group_size);
5114 this->comm = this->comm->createSubcommunicator(idView);
5115 freeArray<mj_part_t>(ids);
5124 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5126 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::fill_permutation_array(
5127 mj_part_t output_num_parts,
5128 mj_part_t num_parts){
5130 if (output_num_parts == 1){
5131 for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
5132 this->new_coordinate_permutations[i] = i;
5134 this->new_part_xadj[0] = this->num_local_coords;
5141 mj_lno_t *num_points_in_parts = allocMemory<mj_lno_t>(num_parts);
5143 mj_part_t *part_shifts = allocMemory<mj_part_t>(num_parts);
5145 memset(num_points_in_parts, 0,
sizeof(mj_lno_t) * num_parts);
5147 for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
5148 mj_part_t ii = this->assigned_part_ids[i];
5149 ++num_points_in_parts[ii];
5154 mj_lno_t prev_index = 0;
5155 for(mj_part_t i = 0; i < num_parts; ++i){
5156 if(num_points_in_parts[i] > 0) {
5157 this->new_part_xadj[p] = prev_index + num_points_in_parts[i];
5158 prev_index += num_points_in_parts[i];
5159 part_shifts[i] = p++;
5164 mj_part_t assigned_num_parts = p - 1;
5165 for (;p < num_parts; ++p){
5166 this->new_part_xadj[p] = this->new_part_xadj[assigned_num_parts];
5168 for(mj_part_t i = 0; i < output_num_parts; ++i){
5169 num_points_in_parts[i] = this->new_part_xadj[i];
5175 for(mj_lno_t i = this->num_local_coords - 1; i >= 0; --i){
5176 mj_part_t part = part_shifts[mj_part_t(this->assigned_part_ids[i])];
5177 this->new_coordinate_permutations[--num_points_in_parts[part]] = i;
5180 freeArray<mj_lno_t>(num_points_in_parts);
5181 freeArray<mj_part_t>(part_shifts);
5208 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5210 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_perform_migration(
5211 mj_part_t input_num_parts,
5212 mj_part_t &output_num_parts,
5213 std::vector<mj_part_t> *next_future_num_parts_in_parts,
5214 mj_part_t &output_part_begin_index,
5215 size_t migration_reduce_all_population,
5216 mj_lno_t num_coords_for_last_dim_part,
5217 std::string iteration,
5218 RCP<mj_partBoxVector_t> &input_part_boxes,
5219 RCP<mj_partBoxVector_t> &output_part_boxes
5222 mj_part_t num_procs = this->comm->getSize();
5223 this->myRank = this->comm->getRank();
5229 mj_gno_t *num_points_in_all_processor_parts = allocMemory<mj_gno_t>(input_num_parts * (num_procs + 1));
5232 this->get_processor_num_points_in_parts(
5235 num_points_in_all_processor_parts);
5239 if (!this->mj_check_to_migrate(
5240 migration_reduce_all_population,
5241 num_coords_for_last_dim_part,
5244 num_points_in_all_processor_parts)){
5245 freeArray<mj_gno_t>(num_points_in_all_processor_parts);
5250 mj_lno_t *send_count_to_each_proc = NULL;
5251 int *coordinate_destinations = allocMemory<int>(this->num_local_coords);
5252 send_count_to_each_proc = allocMemory<mj_lno_t>(num_procs);
5253 for (
int i = 0; i < num_procs; ++i) send_count_to_each_proc[i] = 0;
5255 std::vector<mj_part_t> processor_ranks_for_subcomm;
5256 std::vector<mj_part_t> out_part_indices;
5259 this->mj_migration_part_proc_assignment(
5260 num_points_in_all_processor_parts,
5263 send_count_to_each_proc,
5264 processor_ranks_for_subcomm,
5265 next_future_num_parts_in_parts,
5268 output_part_begin_index,
5269 coordinate_destinations);
5274 freeArray<mj_lno_t>(send_count_to_each_proc);
5275 std::vector <mj_part_t> tmpv;
5277 std::sort (out_part_indices.begin(), out_part_indices.end());
5278 mj_part_t outP = out_part_indices.size();
5280 mj_gno_t new_global_num_points = 0;
5281 mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * input_num_parts;
5283 if (this->mj_keep_part_boxes){
5284 input_part_boxes->clear();
5289 for (mj_part_t i = 0; i < outP; ++i){
5290 mj_part_t ind = out_part_indices[i];
5291 new_global_num_points += global_num_points_in_parts[ind];
5292 tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
5293 if (this->mj_keep_part_boxes){
5294 input_part_boxes->push_back((*output_part_boxes)[ind]);
5298 if (this->mj_keep_part_boxes){
5299 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
5300 input_part_boxes = output_part_boxes;
5301 output_part_boxes = tmpPartBoxes;
5303 next_future_num_parts_in_parts->clear();
5304 for (mj_part_t i = 0; i < outP; ++i){
5305 mj_part_t p = tmpv[i];
5306 next_future_num_parts_in_parts->push_back(p);
5309 freeArray<mj_gno_t>(num_points_in_all_processor_parts);
5311 mj_lno_t num_new_local_points = 0;
5315 this->mj_migrate_coords(
5317 num_new_local_points,
5319 coordinate_destinations,
5323 freeArray<int>(coordinate_destinations);
5325 if(this->num_local_coords != num_new_local_points){
5326 freeArray<mj_lno_t>(this->new_coordinate_permutations);
5327 freeArray<mj_lno_t>(this->coordinate_permutations);
5329 this->new_coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
5330 this->coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
5332 this->num_local_coords = num_new_local_points;
5333 this->num_global_coords = new_global_num_points;
5338 this->create_sub_communicator(processor_ranks_for_subcomm);
5339 processor_ranks_for_subcomm.clear();
5342 this->fill_permutation_array(
5362 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5364 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::create_consistent_chunks(
5365 mj_part_t num_parts,
5366 mj_scalar_t *mj_current_dim_coords,
5367 mj_scalar_t *current_concurrent_cut_coordinate,
5368 mj_lno_t coordinate_begin,
5369 mj_lno_t coordinate_end,
5370 mj_scalar_t *used_local_cut_line_weight_to_left,
5371 mj_lno_t *out_part_xadj,
5372 int coordInd,
bool longest_dim_part, uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted){
5375 mj_part_t no_cuts = num_parts - 1;
5380 mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
5381 mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;
5386 if (this->distribute_points_on_cut_lines){
5388 my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
5389 for (mj_part_t i = 0; i < no_cuts; ++i){
5391 mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
5393 for(
int ii = 0; ii < this->num_threads; ++ii){
5394 if(left_weight > this->sEpsilon){
5396 mj_scalar_t thread_ii_weight_on_cut = this->thread_part_weight_work[ii][i * 2 + 1] - this->thread_part_weight_work[ii][i * 2 ];
5397 if(thread_ii_weight_on_cut < left_weight){
5398 this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
5401 this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
5403 left_weight -= thread_ii_weight_on_cut;
5406 this->thread_cut_line_weight_to_put_left[ii][i] = 0;
5414 for (mj_part_t i = no_cuts - 1; i > 0 ; --i){
5415 if(
ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
5416 my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
5424 for(mj_part_t ii = 0; ii < num_parts; ++ii){
5425 thread_num_points_in_parts[ii] = 0;
5437 mj_part_t *cut_map = allocMemory<mj_part_t> (no_cuts);
5440 typedef uMultiSortItem<mj_lno_t, int, mj_scalar_t> multiSItem;
5441 typedef std::vector< multiSItem > multiSVector;
5442 typedef std::vector<multiSVector> multiS2Vector;
5445 std::vector<mj_scalar_t *>allocated_memory;
5448 multiS2Vector sort_vector_points_on_cut;
5451 mj_part_t different_cut_count = 1;
5456 multiSVector tmpMultiSVector;
5457 sort_vector_points_on_cut.push_back(tmpMultiSVector);
5459 for (mj_part_t i = 1; i < no_cuts ; ++i){
5462 if(
ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
5463 cut_map[i] = cut_map[i-1];
5466 cut_map[i] = different_cut_count++;
5467 multiSVector tmp2MultiSVector;
5468 sort_vector_points_on_cut.push_back(tmp2MultiSVector);
5474 for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
5476 mj_lno_t i = this->coordinate_permutations[ii];
5478 mj_part_t pp = this->assigned_part_ids[i];
5479 mj_part_t p = pp / 2;
5482 mj_scalar_t *
vals = allocMemory<mj_scalar_t>(this->coord_dim -1);
5483 allocated_memory.push_back(vals);
5488 if (longest_dim_part){
5490 for(
int dim = this->coord_dim - 2; dim >= 0; --dim){
5492 int next_largest_coord_dim = p_coord_dimension_range_sorted[dim].id;
5494 vals[val_ind++] = this->mj_coordinates[next_largest_coord_dim][i];
5498 for(
int dim = coordInd + 1; dim < this->coord_dim; ++dim){
5499 vals[val_ind++] = this->mj_coordinates[dim][i];
5501 for(
int dim = 0; dim < coordInd; ++dim){
5502 vals[val_ind++] = this->mj_coordinates[dim][i];
5505 multiSItem tempSortItem(i, this->coord_dim -1, vals);
5507 mj_part_t cmap = cut_map[p];
5508 sort_vector_points_on_cut[cmap].push_back(tempSortItem);
5512 ++thread_num_points_in_parts[p];
5513 this->assigned_part_ids[i] = p;
5518 for (mj_part_t i = 0; i < different_cut_count; ++i){
5519 std::sort (sort_vector_points_on_cut[i].begin(), sort_vector_points_on_cut[i].end());
5523 mj_part_t previous_cut_map = cut_map[0];
5532 mj_scalar_t weight_stolen_from_previous_part = 0;
5533 for (mj_part_t p = 0; p < no_cuts; ++p){
5535 mj_part_t mapped_cut = cut_map[p];
5539 if (previous_cut_map != mapped_cut){
5540 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
5541 for (; sort_vector_end >= 0; --sort_vector_end){
5542 multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
5543 mj_lno_t i = t.index;
5544 ++thread_num_points_in_parts[p];
5545 this->assigned_part_ids[i] = p;
5547 sort_vector_points_on_cut[previous_cut_map].clear();
5551 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[mapped_cut].size() - 1;
5556 for (; sort_vector_end >= 0; --sort_vector_end){
5559 multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
5561 mj_lno_t i = t.index;
5562 mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
5566 if( my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part> this->sEpsilon &&
5567 my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part -
ZOLTAN2_ABS(my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part - w)
5570 my_local_thread_cut_weights_to_put_left[p] -= w;
5571 sort_vector_points_on_cut[mapped_cut].pop_back();
5572 ++thread_num_points_in_parts[p];
5573 this->assigned_part_ids[i] = p;
5576 if(p < no_cuts - 1 && my_local_thread_cut_weights_to_put_left[p] < this->sEpsilon){
5577 if(mapped_cut == cut_map[p + 1] ){
5580 if (previous_cut_map != mapped_cut){
5581 weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
5586 weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
5590 weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
5599 if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]){
5600 if (previous_cut_map != mapped_cut){
5601 weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
5604 weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
5608 weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
5614 previous_cut_map = mapped_cut;
5619 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
5624 for (; sort_vector_end >= 0; --sort_vector_end){
5627 multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
5629 mj_lno_t i = t.index;
5630 ++thread_num_points_in_parts[no_cuts];
5631 this->assigned_part_ids[i] = no_cuts;
5633 sort_vector_points_on_cut[previous_cut_map].clear();
5634 freeArray<mj_part_t> (cut_map);
5637 mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
5638 for(mj_lno_t i = 0; i < vSize; ++i){
5639 freeArray<mj_scalar_t> (allocated_memory[i]);
5643 for(mj_part_t j = 0; j < num_parts; ++j){
5644 mj_lno_t num_points_in_part_j_upto_thread_i = 0;
5645 for (
int i = 0; i < this->num_threads; ++i){
5646 mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
5647 this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
5648 num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;
5651 out_part_xadj[j] = num_points_in_part_j_upto_thread_i;
5655 for(mj_part_t j = 1; j < num_parts; ++j){
5656 out_part_xadj[j] += out_part_xadj[j - 1];
5662 for(mj_part_t j = 1; j < num_parts; ++j){
5663 thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
5668 for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
5669 mj_lno_t i = this->coordinate_permutations[ii];
5670 mj_part_t p = this->assigned_part_ids[i];
5671 this->new_coordinate_permutations[coordinate_begin +
5672 thread_num_points_in_parts[p]++] = i;
5687 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5689 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_final_parts(
5690 mj_part_t current_num_parts,
5691 mj_part_t output_part_begin_index,
5692 RCP<mj_partBoxVector_t> &output_part_boxes,
5693 bool is_data_ever_migrated)
5695 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Part_Assignment");
5697 #ifdef HAVE_ZOLTAN2_OMP
5698 #pragma omp parallel for
5700 for(mj_part_t i = 0; i < current_num_parts;++i){
5703 mj_lno_t end = this->part_xadj[i];
5705 if(i > 0) begin = this->part_xadj[i -1];
5706 mj_part_t part_to_set_index = i + output_part_begin_index;
5707 if (this->mj_keep_part_boxes){
5708 (*output_part_boxes)[i].setpId(part_to_set_index);
5710 for (mj_lno_t ii = begin; ii < end; ++ii){
5711 mj_lno_t k = this->coordinate_permutations[ii];
5712 this->assigned_part_ids[k] = part_to_set_index;
5717 if(!is_data_ever_migrated){
5724 #ifdef ENABLE_ZOLTAN_MIGRATION
5725 if (
sizeof(mj_lno_t) <=
sizeof(
int)) {
5732 ZOLTAN_COMM_OBJ *plan = NULL;
5733 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->mj_problemComm));
5736 int message_tag = 7856;
5738 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Final Z1PlanCreating");
5739 int ierr = Zoltan_Comm_Create( &plan,
int(this->num_local_coords),
5740 this->owner_of_coordinate, mpi_comm, message_tag,
5743 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Final Z1PlanCreating" );
5745 mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(incoming);
5748 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Final Z1PlanComm");
5749 ierr = Zoltan_Comm_Do( plan, message_tag, (
char *) this->current_mj_gnos,
5750 sizeof(mj_gno_t), (
char *) incoming_gnos);
5753 freeArray<mj_gno_t>(this->current_mj_gnos);
5754 this->current_mj_gnos = incoming_gnos;
5756 mj_part_t *incoming_partIds = allocMemory< mj_part_t>(incoming);
5759 ierr = Zoltan_Comm_Do( plan, message_tag, (
char *) this->assigned_part_ids,
5760 sizeof(mj_part_t), (
char *) incoming_partIds);
5762 freeArray<mj_part_t>(this->assigned_part_ids);
5763 this->assigned_part_ids = incoming_partIds;
5765 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Final Z1PlanComm");
5766 ierr = Zoltan_Comm_Destroy(&plan);
5769 this->num_local_coords = incoming;
5774 #endif // !ENABLE_ZOLTAN_MIGRATION
5777 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Final DistributorPlanCreating");
5778 Tpetra::Distributor distributor(this->mj_problemComm);
5779 ArrayView<const mj_part_t> owners_of_coords(this->owner_of_coordinate, this->num_local_coords);
5780 mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
5781 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Final DistributorPlanCreating" );
5783 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Final DistributorPlanComm");
5785 ArrayRCP<mj_gno_t> received_gnos(incoming);
5786 ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
5787 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
5788 freeArray<mj_gno_t>(this->current_mj_gnos);
5789 this->current_mj_gnos = allocMemory<mj_gno_t>(incoming);
5790 memcpy( this->current_mj_gnos,
5791 received_gnos.getRawPtr(),
5792 incoming *
sizeof(mj_gno_t));
5795 ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
5796 ArrayRCP<mj_part_t> received_partids(incoming);
5797 distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
5798 freeArray<mj_part_t>(this->assigned_part_ids);
5799 this->assigned_part_ids = allocMemory<mj_part_t>(incoming);
5800 memcpy( this->assigned_part_ids,
5801 received_partids.getRawPtr(),
5802 incoming *
sizeof(mj_part_t));
5804 this->num_local_coords = incoming;
5805 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Final DistributorPlanComm");
5810 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Part_Assignment");
5812 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Solution_Part_Assignment");
5817 if (this->mj_keep_part_boxes){
5818 this->kept_boxes = compute_global_box_boundaries(output_part_boxes);
5822 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Solution_Part_Assignment");
5827 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5829 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::free_work_memory(){
5830 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Problem_Free");
5832 for (
int i=0; i < this->coord_dim; i++){
5833 freeArray<mj_scalar_t>(this->mj_coordinates[i]);
5835 freeArray<mj_scalar_t *>(this->mj_coordinates);
5837 for (
int i=0; i < this->num_weights_per_coord; i++){
5838 freeArray<mj_scalar_t>(this->mj_weights[i]);
5840 freeArray<mj_scalar_t *>(this->mj_weights);
5842 freeArray<int>(this->owner_of_coordinate);
5844 for(
int i = 0; i < this->num_threads; ++i){
5845 freeArray<mj_lno_t>(this->thread_point_counts[i]);
5848 freeArray<mj_lno_t *>(this->thread_point_counts);
5849 freeArray<double *> (this->thread_part_weight_work);
5851 if(this->distribute_points_on_cut_lines){
5852 freeArray<mj_scalar_t>(this->process_cut_line_weight_to_put_left);
5853 for(
int i = 0; i < this->num_threads; ++i){
5854 freeArray<mj_scalar_t>(this->thread_cut_line_weight_to_put_left[i]);
5856 freeArray<mj_scalar_t *>(this->thread_cut_line_weight_to_put_left);
5857 freeArray<mj_scalar_t>(this->process_rectilinear_cut_weight);
5858 freeArray<mj_scalar_t>(this->global_rectilinear_cut_weight);
5861 freeArray<mj_part_t>(this->my_incomplete_cut_count);
5863 freeArray<mj_scalar_t>(this->max_min_coords);
5865 freeArray<mj_lno_t>(this->part_xadj);
5867 freeArray<mj_lno_t>(this->coordinate_permutations);
5869 freeArray<mj_lno_t>(this->new_coordinate_permutations);
5871 freeArray<mj_scalar_t>(this->all_cut_coordinates);
5873 freeArray<mj_scalar_t> (this->process_local_min_max_coord_total_weight);
5875 freeArray<mj_scalar_t> (this->global_min_max_coord_total_weight);
5877 freeArray<mj_scalar_t>(this->cut_coordinates_work_array);
5879 freeArray<mj_scalar_t>(this->target_part_weights);
5881 freeArray<mj_scalar_t>(this->cut_upper_bound_coordinates);
5883 freeArray<mj_scalar_t>(this->cut_lower_bound_coordinates);
5885 freeArray<mj_scalar_t>(this->cut_lower_bound_weights);
5886 freeArray<mj_scalar_t>(this->cut_upper_bound_weights);
5887 freeArray<bool>(this->is_cut_line_determined);
5888 freeArray<mj_scalar_t>(this->total_part_weight_left_right_closests);
5889 freeArray<mj_scalar_t>(this->global_total_part_weight_left_right_closests);
5891 for(
int i = 0; i < this->num_threads; ++i){
5892 freeArray<double>(this->thread_part_weights[i]);
5893 freeArray<mj_scalar_t>(this->thread_cut_right_closest_point[i]);
5894 freeArray<mj_scalar_t>(this->thread_cut_left_closest_point[i]);
5897 freeArray<double *>(this->thread_part_weights);
5898 freeArray<mj_scalar_t *>(this->thread_cut_left_closest_point);
5899 freeArray<mj_scalar_t *>(this->thread_cut_right_closest_point);
5901 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Free");
5912 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5915 bool distribute_points_on_cut_lines_,
5916 int max_concurrent_part_calculation_,
5917 int check_migrate_avoid_migration_option_,
5918 double minimum_migration_imbalance_,
5919 int migration_type_ ){
5920 this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
5921 this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
5922 this->check_migrate_avoid_migration_option = check_migrate_avoid_migration_option_;
5923 this->minimum_migration_imbalance = minimum_migration_imbalance_;
5924 this->migration_type = migration_type_;
5959 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5963 const RCP<const Environment> &env,
5964 RCP<
const Comm<int> > &problemComm,
5966 double imbalance_tolerance_,
5967 size_t num_global_parts_,
5968 mj_part_t *part_no_array_,
5969 int recursion_depth_,
5972 mj_lno_t num_local_coords_,
5973 mj_gno_t num_global_coords_,
5974 const mj_gno_t *initial_mj_gnos_,
5975 mj_scalar_t **mj_coordinates_,
5977 int num_weights_per_coord_,
5978 bool *mj_uniform_weights_,
5979 mj_scalar_t **mj_weights_,
5980 bool *mj_uniform_parts_,
5981 mj_scalar_t **mj_part_sizes_,
5983 mj_part_t *&result_assigned_part_ids_,
5984 mj_gno_t *&result_mj_gnos_
5991 if(comm->getRank() == 0){
5992 std::cout <<
"size of gno:" <<
sizeof(mj_gno_t) << std::endl;
5993 std::cout <<
"size of lno:" <<
sizeof(mj_lno_t) << std::endl;
5994 std::cout <<
"size of mj_scalar_t:" <<
sizeof(mj_scalar_t) << std::endl;
5998 this->mj_problemComm = problemComm;
5999 this->myActualRank = this->myRank = this->mj_problemComm->getRank();
6021 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Total");
6022 this->mj_env->debug(3,
"In MultiJagged Jagged");
6025 this->imbalance_tolerance = imbalance_tolerance_;
6026 this->num_global_parts = num_global_parts_;
6027 this->part_no_array = part_no_array_;
6028 this->recursion_depth = recursion_depth_;
6030 this->coord_dim = coord_dim_;
6031 this->num_local_coords = num_local_coords_;
6032 this->num_global_coords = num_global_coords_;
6033 this->mj_coordinates = mj_coordinates_;
6034 this->initial_mj_gnos = (mj_gno_t *) initial_mj_gnos_;
6036 this->num_weights_per_coord = num_weights_per_coord_;
6037 this->mj_uniform_weights = mj_uniform_weights_;
6038 this->mj_weights = mj_weights_;
6039 this->mj_uniform_parts = mj_uniform_parts_;
6040 this->mj_part_sizes = mj_part_sizes_;
6042 this->num_threads = 1;
6043 #ifdef HAVE_ZOLTAN2_OMP
6044 #pragma omp parallel
6047 this->num_threads = omp_get_num_threads();
6052 this->set_part_specifications();
6054 this->allocate_set_work_memory();
6058 this->comm = this->mj_problemComm->duplicate();
6061 mj_part_t current_num_parts = 1;
6062 mj_scalar_t *current_cut_coordinates = this->all_cut_coordinates;
6064 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Problem_Partitioning");
6066 mj_part_t output_part_begin_index = 0;
6067 mj_part_t future_num_parts = this->total_num_part;
6068 bool is_data_ever_migrated =
false;
6070 std::vector<mj_part_t> *future_num_part_in_parts =
new std::vector<mj_part_t> ();
6071 std::vector<mj_part_t> *next_future_num_parts_in_parts =
new std::vector<mj_part_t> ();
6072 next_future_num_parts_in_parts->push_back(this->num_global_parts);
6074 RCP<mj_partBoxVector_t> input_part_boxes(
new mj_partBoxVector_t(),
true) ;
6075 RCP<mj_partBoxVector_t> output_part_boxes(
new mj_partBoxVector_t(),
true);
6077 compute_global_box();
6078 if(this->mj_keep_part_boxes){
6079 this->init_part_boxes(output_part_boxes);
6082 for (
int i = 0; i < this->recursion_depth; ++i){
6085 std::vector <mj_part_t> num_partitioning_in_current_dim;
6099 std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
6100 future_num_part_in_parts = next_future_num_parts_in_parts;
6101 next_future_num_parts_in_parts = tmpPartVect;
6106 next_future_num_parts_in_parts->clear();
6108 if(this->mj_keep_part_boxes){
6109 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
6110 input_part_boxes = output_part_boxes;
6111 output_part_boxes = tmpPartBoxes;
6112 output_part_boxes->clear();
6116 mj_part_t output_part_count_in_dimension =
6117 this->update_part_num_arrays(
6118 num_partitioning_in_current_dim,
6119 future_num_part_in_parts,
6120 next_future_num_parts_in_parts,
6125 output_part_boxes, 1);
6130 if(output_part_count_in_dimension == current_num_parts) {
6132 tmpPartVect= future_num_part_in_parts;
6133 future_num_part_in_parts = next_future_num_parts_in_parts;
6134 next_future_num_parts_in_parts = tmpPartVect;
6136 if(this->mj_keep_part_boxes){
6137 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
6138 input_part_boxes = output_part_boxes;
6139 output_part_boxes = tmpPartBoxes;
6146 int coordInd = i % this->coord_dim;
6147 mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
6150 std::string istring = Teuchos::toString<int>(i);
6151 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Problem_Partitioning_" + istring);
6155 this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);
6158 mj_part_t output_part_index = 0;
6161 mj_part_t output_coordinate_end_index = 0;
6163 mj_part_t current_work_part = 0;
6164 mj_part_t current_concurrent_num_parts =
6165 std::min(current_num_parts - current_work_part, this->max_concurrent_part_calculation);
6167 mj_part_t obtained_part_index = 0;
6170 for (; current_work_part < current_num_parts;
6171 current_work_part += current_concurrent_num_parts){
6173 current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
6174 this->max_concurrent_part_calculation);
6176 mj_part_t actual_work_part_count = 0;
6180 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
6181 mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;
6185 if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
6188 ++actual_work_part_count;
6189 mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
6190 mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts==0 ? 0: this->part_xadj[current_work_part_in_concurrent_parts -1];
6198 this->mj_get_local_min_max_coord_totW(
6199 coordinate_begin_index,
6200 coordinate_end_index,
6201 this->coordinate_permutations,
6202 mj_current_dim_coords,
6203 this->process_local_min_max_coord_total_weight[kk],
6204 this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts],
6205 this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts]);
6210 if (actual_work_part_count > 0){
6212 this->mj_get_global_min_max_coord_totW(
6213 current_concurrent_num_parts,
6214 this->process_local_min_max_coord_total_weight,
6215 this->global_min_max_coord_total_weight);
6219 mj_part_t total_incomplete_cut_count = 0;
6224 mj_part_t concurrent_part_cut_shift = 0;
6225 mj_part_t concurrent_part_part_shift = 0;
6226 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
6227 mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
6228 mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
6229 current_concurrent_num_parts];
6231 mj_scalar_t global_total_weight = this->global_min_max_coord_total_weight[kk +
6232 2 * current_concurrent_num_parts];
6234 mj_part_t concurrent_current_part_index = current_work_part + kk;
6236 mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];
6238 mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
6239 mj_scalar_t *current_target_part_weights = this->target_part_weights +
6240 concurrent_part_part_shift;
6242 concurrent_part_cut_shift += partition_count - 1;
6244 concurrent_part_part_shift += partition_count;
6249 if(partition_count > 1 && min_coordinate <= max_coordinate){
6253 total_incomplete_cut_count += partition_count - 1;
6256 this->my_incomplete_cut_count[kk] = partition_count - 1;
6259 this->mj_get_initial_cut_coords_target_weights(
6262 partition_count - 1,
6263 global_total_weight,
6265 current_target_part_weights,
6266 future_num_part_in_parts,
6267 next_future_num_parts_in_parts,
6268 concurrent_current_part_index,
6269 obtained_part_index);
6271 mj_lno_t coordinate_end_index= this->part_xadj[concurrent_current_part_index];
6272 mj_lno_t coordinate_begin_index = concurrent_current_part_index==0 ? 0: this->part_xadj[concurrent_current_part_index -1];
6276 this->set_initial_coordinate_parts(
6279 concurrent_current_part_index,
6280 coordinate_begin_index, coordinate_end_index,
6281 this->coordinate_permutations,
6282 mj_current_dim_coords,
6283 this->assigned_part_ids,
6288 this->my_incomplete_cut_count[kk] = 0;
6290 obtained_part_index += partition_count;
6297 double used_imbalance = 0;
6302 mj_current_dim_coords,
6305 current_concurrent_num_parts,
6306 current_cut_coordinates,
6307 total_incomplete_cut_count,
6308 num_partitioning_in_current_dim);
6313 mj_part_t output_array_shift = 0;
6314 mj_part_t cut_shift = 0;
6315 size_t tlr_shift = 0;
6316 size_t partweight_array_shift = 0;
6318 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
6319 mj_part_t current_concurrent_work_part = current_work_part + kk;
6320 mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];
6323 if((num_parts != 1 )
6325 this->global_min_max_coord_total_weight[kk] >
6326 this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {
6330 for(mj_part_t jj = 0; jj < num_parts; ++jj){
6331 this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
6333 cut_shift += num_parts - 1;
6334 tlr_shift += (4 *(num_parts - 1) + 1);
6335 output_array_shift += num_parts;
6336 partweight_array_shift += (2 * (num_parts - 1) + 1);
6340 mj_lno_t coordinate_end= this->part_xadj[current_concurrent_work_part];
6341 mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[
6342 current_concurrent_work_part -1];
6343 mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
6344 mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
6349 for(
int ii = 0; ii < this->num_threads; ++ii){
6350 this->thread_part_weight_work[ii] = this->thread_part_weights[ii] + partweight_array_shift;
6354 if(this->mj_keep_part_boxes){
6356 for (mj_part_t j = 0; j < num_parts - 1; ++j){
6357 (*output_part_boxes)[output_array_shift + output_part_index +
6358 j].updateMinMax(current_concurrent_cut_coordinate[j], 1
6361 (*output_part_boxes)[output_array_shift + output_part_index + j +
6362 1].updateMinMax(current_concurrent_cut_coordinate[j], 0
6368 this->mj_create_new_partitions(
6370 mj_current_dim_coords,
6371 current_concurrent_cut_coordinate,
6374 used_local_cut_line_weight_to_left,
6375 this->thread_part_weight_work,
6376 this->new_part_xadj + output_part_index + output_array_shift
6383 mj_lno_t part_size = coordinate_end - coordinate_begin;
6384 *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
6386 this->new_coordinate_permutations + coordinate_begin,
6387 this->coordinate_permutations + coordinate_begin,
6388 part_size *
sizeof(mj_lno_t));
6390 cut_shift += num_parts - 1;
6391 tlr_shift += (4 *(num_parts - 1) + 1);
6392 output_array_shift += num_parts;
6393 partweight_array_shift += (2 * (num_parts - 1) + 1);
6403 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
6404 mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
6405 for (mj_part_t ii = 0;ii < num_parts ; ++ii){
6407 this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
6410 output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
6412 output_part_index += num_parts ;
6419 int current_world_size = this->comm->getSize();
6420 long migration_reduce_all_population = this->total_dim_num_reduce_all * current_world_size;
6423 bool is_migrated_in_current_dimension =
false;
6428 if (future_num_parts > 1 &&
6429 this->check_migrate_avoid_migration_option >= 0 &&
6430 current_world_size > 1){
6432 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Problem_Migration-" + istring);
6433 mj_part_t num_parts = output_part_count_in_dimension;
6434 if ( this->mj_perform_migration(
6437 next_future_num_parts_in_parts,
6438 output_part_begin_index,
6439 migration_reduce_all_population,
6440 this->num_global_coords / (future_num_parts * current_num_parts),
6442 input_part_boxes, output_part_boxes) ) {
6443 is_migrated_in_current_dimension =
true;
6444 is_data_ever_migrated =
true;
6445 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Migration-" +
6448 this->total_dim_num_reduce_all /= num_parts;
6451 is_migrated_in_current_dimension =
false;
6452 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Migration-" + istring);
6457 mj_lno_t * tmp = this->coordinate_permutations;
6458 this->coordinate_permutations = this->new_coordinate_permutations;
6459 this->new_coordinate_permutations = tmp;
6461 if(!is_migrated_in_current_dimension){
6462 this->total_dim_num_reduce_all -= current_num_parts;
6463 current_num_parts = output_part_count_in_dimension;
6465 freeArray<mj_lno_t>(this->part_xadj);
6466 this->part_xadj = this->new_part_xadj;
6467 this->new_part_xadj = NULL;
6468 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Partitioning_" + istring);
6472 delete future_num_part_in_parts;
6473 delete next_future_num_parts_in_parts;
6475 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Partitioning");
6482 this->set_final_parts(
6484 output_part_begin_index,
6486 is_data_ever_migrated);
6488 result_assigned_part_ids_ = this->assigned_part_ids;
6489 result_mj_gnos_ = this->current_mj_gnos;
6491 this->free_work_memory();
6492 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Total");
6493 this->mj_env->debug(3,
"Out of MultiJagged");
6501 template <
typename Adapter>
6506 #ifndef DOXYGEN_SHOULD_SKIP_THIS
6513 typedef typename Adapter::scalar_t adapter_scalar_t;
6516 typedef float default_mj_scalar_t;
6522 (std::is_same<adapter_scalar_t, float>::value ||
6523 std::is_same<adapter_scalar_t, double>::value),
6524 adapter_scalar_t, default_mj_scalar_t>::type mj_scalar_t;
6526 typedef typename Adapter::gno_t mj_gno_t;
6527 typedef typename Adapter::lno_t mj_lno_t;
6528 typedef typename Adapter::node_t mj_node_t;
6531 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
6535 RCP<const Environment> mj_env;
6536 RCP<const Comm<int> > mj_problemComm;
6537 RCP<const coordinateModel_t> mj_coords;
6540 double imbalance_tolerance;
6541 size_t num_global_parts;
6542 mj_part_t *part_no_array;
6543 int recursion_depth;
6546 mj_lno_t num_local_coords;
6547 mj_gno_t num_global_coords;
6548 const mj_gno_t *initial_mj_gnos;
6549 mj_scalar_t **mj_coordinates;
6551 int num_weights_per_coord;
6552 bool *mj_uniform_weights;
6553 mj_scalar_t **mj_weights;
6554 bool *mj_uniform_parts;
6555 mj_scalar_t **mj_part_sizes;
6557 bool distribute_points_on_cut_lines;
6558 mj_part_t max_concurrent_part_calculation;
6559 int check_migrate_avoid_migration_option;
6562 double minimum_migration_imbalance;
6563 bool mj_keep_part_boxes;
6568 int mj_premigration_option;
6569 int min_coord_per_rank_for_premigration;
6571 ArrayRCP<mj_part_t> comXAdj_;
6572 ArrayRCP<mj_part_t> comAdj_;
6578 ArrayRCP<const mj_scalar_t> * coordinate_ArrayRCP_holder;
6580 void set_up_partitioning_data(
6583 void set_input_parameters(
const Teuchos::ParameterList &p);
6585 void free_work_memory();
6587 RCP<mj_partBoxVector_t> getGlobalBoxBoundaries()
const;
6589 bool mj_premigrate_to_subset(
int used_num_ranks,
int migration_selection_option,
6590 RCP<const Environment> mj_env_,
6591 RCP<
const Comm<int> > mj_problemComm_,
6593 mj_lno_t num_local_coords_,
6594 mj_gno_t num_global_coords_,
size_t num_global_parts_,
6595 const mj_gno_t *initial_mj_gnos_,
6596 mj_scalar_t **mj_coordinates_,
6597 int num_weights_per_coord_,
6598 mj_scalar_t **mj_weights_,
6600 RCP<
const Comm<int> > &result_problemComm_,
6601 mj_lno_t & result_num_local_coords_,
6602 mj_gno_t * &result_initial_mj_gnos_,
6603 mj_scalar_t ** &result_mj_coordinates_,
6604 mj_scalar_t ** &result_mj_weights_,
6605 int * &result_actual_owner_rank_);
6610 RCP<
const Comm<int> > &problemComm,
6611 const RCP<const coordinateModel_t> &coords) :
6612 mj_partitioner(), mj_env(env),
6613 mj_problemComm(problemComm),
6615 imbalance_tolerance(0),
6616 num_global_parts(1), part_no_array(NULL),
6618 coord_dim(0),num_local_coords(0), num_global_coords(0),
6619 initial_mj_gnos(NULL), mj_coordinates(NULL),
6620 num_weights_per_coord(0),
6621 mj_uniform_weights(NULL), mj_weights(NULL),
6622 mj_uniform_parts(NULL),
6623 mj_part_sizes(NULL),
6624 distribute_points_on_cut_lines(true),
6625 max_concurrent_part_calculation(1),
6626 check_migrate_avoid_migration_option(0), migration_type(0),
6627 minimum_migration_imbalance(0.30),
6628 mj_keep_part_boxes(false), num_threads(1), mj_run_as_rcb(false),mj_premigration_option(0), min_coord_per_rank_for_premigration(32000),
6629 comXAdj_(), comAdj_(), coordinate_ArrayRCP_holder (NULL)
6633 if (coordinate_ArrayRCP_holder != NULL){
6634 delete [] this->coordinate_ArrayRCP_holder;
6635 this->coordinate_ArrayRCP_holder = NULL;
6643 const bool bUnsorted =
true;
6644 RCP<Zoltan2::IntegerRangeListValidator<int>> mj_parts_Validator =
6646 pl.set(
"mj_parts",
"0",
"list of parts for multiJagged partitioning "
6647 "algorithm. As many as the dimension count.", mj_parts_Validator);
6649 pl.set(
"mj_concurrent_part_count", 1,
"The number of parts whose cut "
6652 pl.set(
"mj_minimum_migration_imbalance", 1.1,
6653 "mj_minimum_migration_imbalance, the minimum imbalance of the "
6654 "processors to avoid migration",
6657 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_option_validator =
6658 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 2) );
6659 pl.set(
"mj_migration_option", 1,
"Migration option, 0 for decision "
6660 "depending on the imbalance, 1 for forcing migration, 2 for "
6661 "avoiding migration", mj_migration_option_validator);
6666 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_type_validator =
6667 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 1) );
6668 pl.set(
"mj_migration_type", 0,
"Migration type, 0 for migration to minimize the imbalance "
6669 "1 for migration to minimize messages exchanged the migration." ,
6670 mj_migration_option_validator);
6673 pl.set(
"mj_keep_part_boxes",
false,
"Keep the part boundaries of the "
6677 pl.set(
"mj_enable_rcb",
false,
"Use MJ as RCB.",
6680 pl.set(
"mj_recursion_depth", -1,
"Recursion depth for MJ: Must be "
6683 RCP<Teuchos::EnhancedNumberValidator<int>> mj_premigration_option_validator =
6684 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 1024) );
6686 pl.set(
"mj_premigration_option", 0,
"Whether to do premigration or not. 0 for no migration "
6687 "x > 0 for migration to consecutive processors, the subset will be 0,x,2x,3x,...subset ranks."
6688 , mj_premigration_option_validator);
6690 pl.set(
"mj_premigration_coordinate_count", 32000,
"How many coordinate to assign each rank in multijagged after premigration"
6705 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
6709 mj_part_t
pointAssign(
int dim, adapter_scalar_t *point)
const;
6711 void boxAssign(
int dim, adapter_scalar_t *lower, adapter_scalar_t *upper,
6712 size_t &nPartsFound, mj_part_t **partsFound)
const;
6719 ArrayRCP<mj_part_t> &comXAdj,
6720 ArrayRCP<mj_part_t> &comAdj);
6726 template <
typename Adapter>
6727 bool Zoltan2_AlgMJ<Adapter>::mj_premigrate_to_subset(
int used_num_ranks,
6729 RCP<const Environment> mj_env_,
6730 RCP<
const Comm<int> > mj_problemComm_,
6732 mj_lno_t num_local_coords_,
6734 const mj_gno_t *initial_mj_gnos_,
6735 mj_scalar_t **mj_coordinates_,
6736 int num_weights_per_coord_,
6737 mj_scalar_t **mj_weights_,
6739 RCP<
const Comm<int> > &result_problemComm_,
6740 mj_lno_t &result_num_local_coords_,
6741 mj_gno_t * &result_initial_mj_gnos_,
6742 mj_scalar_t ** &result_mj_coordinates_,
6743 mj_scalar_t ** &result_mj_weights_,
6744 int * &result_actual_owner_rank_){
6745 mj_env_->timerStart(
MACRO_TIMERS,
"MultiJagged - PreMigration DistributorPlanCreating");
6748 int myRank = mj_problemComm_->getRank();
6749 int worldSize = mj_problemComm_->getSize();
6751 mj_part_t groupsize = worldSize / used_num_ranks;
6755 std::vector<mj_part_t> group_begins(used_num_ranks + 1, 0);
6757 mj_part_t i_am_sending_to = 0;
6758 bool am_i_a_receiver =
false;
6760 for(
int i = 0; i < used_num_ranks; ++i){
6761 group_begins[i+ 1] = group_begins[i] + groupsize;
6762 if (worldSize % used_num_ranks > i) group_begins[i+ 1] += 1;
6763 if (i == used_num_ranks) group_begins[i+ 1] = worldSize;
6764 if (myRank >= group_begins[i] && myRank < group_begins[i + 1]) i_am_sending_to = group_begins[i];
6765 if (myRank == group_begins[i]) am_i_a_receiver=
true;
6768 ArrayView<const mj_part_t> idView(&(group_begins[0]), used_num_ranks );
6769 result_problemComm_ = mj_problemComm_->createSubcommunicator(idView);
6772 Tpetra::Distributor distributor(mj_problemComm_);
6774 std::vector<mj_part_t> coordinate_destinations(num_local_coords_, i_am_sending_to);
6775 ArrayView<const mj_part_t> destinations( &(coordinate_destinations[0]), num_local_coords_);
6776 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
6777 result_num_local_coords_ = num_incoming_gnos;
6778 mj_env_->timerStop(
MACRO_TIMERS,
"MultiJagged - PreMigration DistributorPlanCreating");
6780 mj_env_->timerStart(
MACRO_TIMERS,
"MultiJagged - PreMigration DistributorMigration");
6784 ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
6786 ArrayView<const mj_gno_t> sent_gnos(initial_mj_gnos_, num_local_coords_);
6787 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
6789 result_initial_mj_gnos_ = allocMemory<mj_gno_t>(num_incoming_gnos);
6791 result_initial_mj_gnos_,
6792 received_gnos.getRawPtr(),
6793 num_incoming_gnos *
sizeof(mj_gno_t));
6797 result_mj_coordinates_ = allocMemory<mj_scalar_t *>(coord_dim_);
6798 for (
int i = 0; i < coord_dim_; ++i){
6799 ArrayView<const mj_scalar_t> sent_coord(mj_coordinates_[i], num_local_coords_);
6800 ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
6801 distributor.doPostsAndWaits<mj_scalar_t>(sent_coord, 1, received_coord());
6802 result_mj_coordinates_[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
6804 result_mj_coordinates_[i],
6805 received_coord.getRawPtr(),
6806 num_incoming_gnos *
sizeof(mj_scalar_t));
6809 result_mj_weights_ = allocMemory<mj_scalar_t *>(num_weights_per_coord_);
6811 for (
int i = 0; i < num_weights_per_coord_; ++i){
6812 ArrayView<const mj_scalar_t> sent_weight(mj_weights_[i], num_local_coords_);
6813 ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
6814 distributor.doPostsAndWaits<mj_scalar_t>(sent_weight, 1, received_weight());
6815 result_mj_weights_[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
6817 result_mj_weights_[i],
6818 received_weight.getRawPtr(),
6819 num_incoming_gnos *
sizeof(mj_scalar_t));
6824 std::vector<int> owner_of_coordinate(num_local_coords_, myRank);
6825 ArrayView<int> sent_owners(&(owner_of_coordinate[0]), num_local_coords_);
6826 ArrayRCP<int> received_owners(num_incoming_gnos);
6827 distributor.doPostsAndWaits<
int>(sent_owners, 1, received_owners());
6828 result_actual_owner_rank_ = allocMemory<int>(num_incoming_gnos);
6830 result_actual_owner_rank_,
6831 received_owners.getRawPtr(),
6832 num_incoming_gnos *
sizeof(int));
6834 mj_env_->timerStop(
MACRO_TIMERS,
"MultiJagged - PreMigration DistributorMigration");
6835 return am_i_a_receiver;
6853 template <
typename Adapter>
6858 this->set_up_partitioning_data(solution);
6859 this->set_input_parameters(this->mj_env->getParameters());
6860 if (this->mj_keep_part_boxes){
6861 this->mj_partitioner.set_to_keep_part_boxes();
6863 this->mj_partitioner.set_partitioning_parameters(
6864 this->distribute_points_on_cut_lines,
6865 this->max_concurrent_part_calculation,
6866 this->check_migrate_avoid_migration_option,
6867 this->minimum_migration_imbalance, this->migration_type);
6870 RCP<const Comm<int> > result_problemComm = this->mj_problemComm;
6871 mj_lno_t result_num_local_coords = this->num_local_coords;
6872 mj_gno_t * result_initial_mj_gnos = NULL;
6873 mj_scalar_t **result_mj_coordinates = this->mj_coordinates;
6874 mj_scalar_t **result_mj_weights = this->mj_weights;
6875 int *result_actual_owner_rank = NULL;
6876 const mj_gno_t * result_initial_mj_gnos_ = this->initial_mj_gnos;
6893 int current_world_size = this->mj_problemComm->getSize();
6894 mj_lno_t threshold_num_local_coords = this->min_coord_per_rank_for_premigration;
6895 bool is_pre_migrated =
false;
6896 bool am_i_in_subset =
true;
6897 if ( mj_premigration_option > 0 &&
6898 size_t (current_world_size) > this->num_global_parts &&
6899 this->num_global_coords < mj_gno_t (current_world_size * threshold_num_local_coords)){
6900 if (this->mj_keep_part_boxes){
6901 throw std::logic_error(
"Multijagged: mj_keep_part_boxes and mj_premigration_option are not supported together yet.");
6903 is_pre_migrated =
true;
6904 int migration_selection_option = mj_premigration_option;
6905 if(migration_selection_option * this->num_global_parts > (
size_t) (current_world_size)){
6906 migration_selection_option = current_world_size / this->num_global_parts;
6908 int used_num_ranks = int (this->num_global_coords /
float (threshold_num_local_coords) + 0.5);
6909 if (used_num_ranks == 0) used_num_ranks = 1;
6911 am_i_in_subset = this->mj_premigrate_to_subset(
6913 migration_selection_option,
6915 this->mj_problemComm,
6917 this->num_local_coords,
6918 this->num_global_coords,
6919 this->num_global_parts,
6920 this->initial_mj_gnos,
6921 this->mj_coordinates,
6922 this->num_weights_per_coord,
6926 result_num_local_coords,
6927 result_initial_mj_gnos,
6928 result_mj_coordinates,
6930 result_actual_owner_rank);
6931 result_initial_mj_gnos_ = result_initial_mj_gnos;
6936 mj_part_t *result_assigned_part_ids = NULL;
6937 mj_gno_t *result_mj_gnos = NULL;
6939 if (am_i_in_subset){
6940 this->mj_partitioner.multi_jagged_part(
6944 this->imbalance_tolerance,
6945 this->num_global_parts,
6946 this->part_no_array,
6947 this->recursion_depth,
6950 result_num_local_coords,
6951 this->num_global_coords,
6952 result_initial_mj_gnos_,
6953 result_mj_coordinates,
6955 this->num_weights_per_coord,
6956 this->mj_uniform_weights,
6958 this->mj_uniform_parts,
6959 this->mj_part_sizes,
6961 result_assigned_part_ids,
6969 #if defined(__cplusplus) && __cplusplus >= 201103L
6970 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid;
6971 localGidToLid.reserve(result_num_local_coords);
6972 for (mj_lno_t i = 0; i < result_num_local_coords; i++)
6973 localGidToLid[result_initial_mj_gnos_[i]] = i;
6974 ArrayRCP<mj_part_t> partId = arcp(
new mj_part_t[result_num_local_coords],
6975 0, result_num_local_coords,
true);
6977 for (mj_lno_t i = 0; i < result_num_local_coords; i++) {
6978 mj_lno_t origLID = localGidToLid[result_mj_gnos[i]];
6979 partId[origLID] = result_assigned_part_ids[i];
6983 Teuchos::Hashtable<mj_gno_t, mj_lno_t>
6984 localGidToLid(result_num_local_coords);
6985 for (mj_lno_t i = 0; i < result_num_local_coords; i++)
6986 localGidToLid.put(result_initial_mj_gnos_[i], i);
6988 ArrayRCP<mj_part_t> partId = arcp(
new mj_part_t[result_num_local_coords],
6989 0, result_num_local_coords,
true);
6991 for (mj_lno_t i = 0; i < result_num_local_coords; i++) {
6992 mj_lno_t origLID = localGidToLid.get(result_mj_gnos[i]);
6993 partId[origLID] = result_assigned_part_ids[i];
6996 #endif // C++11 is enabled
6998 delete [] result_mj_gnos;
6999 delete [] result_assigned_part_ids;
7004 if (is_pre_migrated){
7005 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - PostMigration DistributorPlanCreating");
7006 Tpetra::Distributor distributor(this->mj_problemComm);
7008 ArrayView<const mj_part_t> actual_owner_destinations( result_actual_owner_rank , result_num_local_coords);
7009 mj_lno_t num_incoming_gnos = distributor.createFromSends(actual_owner_destinations);
7010 if (num_incoming_gnos != this->num_local_coords){
7011 throw std::logic_error(
"Zoltan2 - Multijagged Post Migration - num incoming is not equal to num local coords");
7013 mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - PostMigration DistributorPlanCreating");
7014 mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - PostMigration DistributorMigration");
7015 ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
7016 ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
7018 ArrayView<const mj_gno_t> sent_gnos(result_initial_mj_gnos_, result_num_local_coords);
7019 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
7022 ArrayView<mj_part_t> sent_partnos(partId());
7023 distributor.doPostsAndWaits<mj_part_t>(sent_partnos, 1, received_partids());
7025 partId = arcp(
new mj_part_t[this->num_local_coords],
7026 0, this->num_local_coords,
true);
7029 #if defined(__cplusplus) && __cplusplus >= 201103L
7030 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid2;
7031 localGidToLid2.reserve(this->num_local_coords);
7032 for (mj_lno_t i = 0; i < this->num_local_coords; i++)
7033 localGidToLid2[this->initial_mj_gnos[i]] = i;
7036 for (mj_lno_t i = 0; i < this->num_local_coords; i++) {
7037 mj_lno_t origLID = localGidToLid2[received_gnos[i]];
7038 partId[origLID] = received_partids[i];
7042 Teuchos::Hashtable<mj_gno_t, mj_lno_t>
7043 localGidToLid2(this->num_local_coords);
7044 for (mj_lno_t i = 0; i < this->num_local_coords; i++)
7045 localGidToLid2.put(this->initial_mj_gnos[i], i);
7048 for (mj_lno_t i = 0; i < this->num_local_coords; i++) {
7049 mj_lno_t origLID = localGidToLid2.get(received_gnos[i]);
7050 partId[origLID] = received_partids[i];
7053 #endif // C++11 is enabled
7058 freeArray<mj_gno_t> (result_initial_mj_gnos);
7059 for (
int i = 0; i < this->coord_dim; ++i){
7060 freeArray<mj_scalar_t> (result_mj_coordinates[i]);
7062 freeArray<mj_scalar_t *> (result_mj_coordinates);
7064 for (
int i = 0; i < this->num_weights_per_coord; ++i){
7065 freeArray<mj_scalar_t> (result_mj_weights[i]);
7067 freeArray<mj_scalar_t *> (result_mj_weights);
7068 freeArray<int> (result_actual_owner_rank);
7070 mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - PostMigration DistributorMigration");
7074 solution->setParts(partId);
7075 this->free_work_memory();
7080 template <
typename Adapter>
7082 freeArray<mj_scalar_t *>(this->mj_coordinates);
7083 freeArray<mj_scalar_t *>(this->mj_weights);
7084 freeArray<bool>(this->mj_uniform_parts);
7085 freeArray<mj_scalar_t *>(this->mj_part_sizes);
7086 freeArray<bool>(this->mj_uniform_weights);
7092 template <
typename Adapter>
7093 void Zoltan2_AlgMJ<Adapter>::set_up_partitioning_data(
7094 const RCP<PartitioningSolution<Adapter> > &solution
7097 this->coord_dim = this->mj_coords->getCoordinateDim();
7098 this->num_weights_per_coord = this->mj_coords->getNumWeightsPerCoordinate();
7099 this->num_local_coords = this->mj_coords->getLocalNumCoordinates();
7100 this->num_global_coords = this->mj_coords->getGlobalNumCoordinates();
7101 int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
7106 this->num_global_parts = solution->getTargetGlobalNumberOfParts();
7109 this->mj_coordinates = allocMemory<mj_scalar_t *>(this->coord_dim);
7110 this->mj_weights = allocMemory<mj_scalar_t *>(criteria_dim);
7113 this->mj_uniform_parts = allocMemory< bool >(criteria_dim);
7116 this->mj_part_sizes = allocMemory<mj_scalar_t *>(criteria_dim);
7118 this->mj_uniform_weights = allocMemory< bool >(criteria_dim);
7120 typedef StridedData<mj_lno_t, adapter_scalar_t> input_t;
7121 ArrayView<const mj_gno_t> gnos;
7122 ArrayView<input_t> xyz;
7123 ArrayView<input_t> wgts;
7126 this->coordinate_ArrayRCP_holder =
new ArrayRCP<const mj_scalar_t> [this->coord_dim + this->num_weights_per_coord];
7128 this->mj_coords->getCoordinates(gnos, xyz, wgts);
7130 ArrayView<const mj_gno_t> mj_gnos = gnos;
7131 this->initial_mj_gnos = mj_gnos.getRawPtr();
7134 for (
int dim=0; dim < this->coord_dim; dim++){
7135 ArrayRCP<const mj_scalar_t> ar;
7136 xyz[dim].getInputArray(ar);
7138 this->coordinate_ArrayRCP_holder[dim] = ar;
7141 this->mj_coordinates[dim] = (mj_scalar_t *)ar.getRawPtr();
7145 if (this->num_weights_per_coord == 0){
7146 this->mj_uniform_weights[0] =
true;
7147 this->mj_weights[0] = NULL;
7151 for (
int wdim = 0; wdim < this->num_weights_per_coord; wdim++){
7152 ArrayRCP<const mj_scalar_t> ar;
7153 wgts[wdim].getInputArray(ar);
7156 this->coordinate_ArrayRCP_holder[this->coord_dim + wdim] = ar;
7157 this->mj_uniform_weights[wdim] =
false;
7158 this->mj_weights[wdim] = (mj_scalar_t *) ar.getRawPtr();
7162 for (
int wdim = 0; wdim < criteria_dim; wdim++){
7163 if (solution->criteriaHasUniformPartSizes(wdim)){
7164 this->mj_uniform_parts[wdim] =
true;
7165 this->mj_part_sizes[wdim] = NULL;
7168 std::cerr <<
"MJ does not support non uniform target part weights" << std::endl;
7177 template <
typename Adapter>
7178 void Zoltan2_AlgMJ<Adapter>::set_input_parameters(
const Teuchos::ParameterList &pl){
7180 const Teuchos::ParameterEntry *pe = pl.getEntryPtr(
"imbalance_tolerance");
7183 tol = pe->getValue(&tol);
7184 this->imbalance_tolerance = tol - 1.0;
7188 if (this->imbalance_tolerance <= 0)
7189 this->imbalance_tolerance= 10e-4;
7192 this->part_no_array = NULL;
7194 this->recursion_depth = 0;
7196 if (pl.getPtr<Array <mj_part_t> >(
"mj_parts")){
7197 this->part_no_array = (mj_part_t *) pl.getPtr<Array <mj_part_t> >(
"mj_parts")->getRawPtr();
7198 this->recursion_depth = pl.getPtr<Array <mj_part_t> >(
"mj_parts")->size() - 1;
7199 this->mj_env->debug(2,
"mj_parts provided by user");
7203 this->distribute_points_on_cut_lines =
true;
7204 this->max_concurrent_part_calculation = 1;
7206 this->mj_run_as_rcb =
false;
7207 this->mj_premigration_option = 0;
7208 this->min_coord_per_rank_for_premigration = 32000;
7210 int mj_user_recursion_depth = -1;
7211 this->mj_keep_part_boxes =
false;
7212 this->check_migrate_avoid_migration_option = 0;
7213 this->migration_type = 0;
7214 this->minimum_migration_imbalance = 0.35;
7216 pe = pl.getEntryPtr(
"mj_minimum_migration_imbalance");
7219 imb = pe->getValue(&imb);
7220 this->minimum_migration_imbalance = imb - 1.0;
7223 pe = pl.getEntryPtr(
"mj_migration_option");
7225 this->check_migrate_avoid_migration_option = pe->getValue(&this->check_migrate_avoid_migration_option);
7227 this->check_migrate_avoid_migration_option = 0;
7229 if (this->check_migrate_avoid_migration_option > 1) this->check_migrate_avoid_migration_option = -1;
7232 pe = pl.getEntryPtr(
"mj_migration_type");
7234 this->migration_type = pe->getValue(&this->migration_type);
7236 this->migration_type = 0;
7241 pe = pl.getEntryPtr(
"mj_concurrent_part_count");
7243 this->max_concurrent_part_calculation = pe->getValue(&this->max_concurrent_part_calculation);
7245 this->max_concurrent_part_calculation = 1;
7248 pe = pl.getEntryPtr(
"mj_keep_part_boxes");
7250 this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
7252 this->mj_keep_part_boxes =
false;
7264 if (this->mj_keep_part_boxes ==
false){
7265 pe = pl.getEntryPtr(
"mapping_type");
7267 int mapping_type = -1;
7268 mapping_type = pe->getValue(&mapping_type);
7269 if (mapping_type == 0){
7270 mj_keep_part_boxes =
true;
7276 pe = pl.getEntryPtr(
"mj_enable_rcb");
7278 this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
7280 this->mj_run_as_rcb =
false;
7283 pe = pl.getEntryPtr(
"mj_premigration_option");
7285 mj_premigration_option = pe->getValue(&mj_premigration_option);
7287 mj_premigration_option = 0;
7290 pe = pl.getEntryPtr(
"mj_premigration_coordinate_count");
7292 min_coord_per_rank_for_premigration = pe->getValue(&mj_premigration_option);
7294 min_coord_per_rank_for_premigration = 32000;
7297 pe = pl.getEntryPtr(
"mj_recursion_depth");
7299 mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
7301 mj_user_recursion_depth = -1;
7305 pe = pl.getEntryPtr(
"rectilinear");
7306 if (pe) val = pe->getValue(&val);
7308 this->distribute_points_on_cut_lines =
false;
7310 this->distribute_points_on_cut_lines =
true;
7313 if (this->mj_run_as_rcb){
7314 mj_user_recursion_depth = (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
7316 if (this->recursion_depth < 1){
7317 if (mj_user_recursion_depth > 0){
7318 this->recursion_depth = mj_user_recursion_depth;
7321 this->recursion_depth = this->coord_dim;
7325 this->num_threads = 1;
7326 #ifdef HAVE_ZOLTAN2_OMP
7327 #pragma omp parallel
7329 this->num_threads = omp_get_num_threads();
7336 template <
typename Adapter>
7339 adapter_scalar_t *lower,
7340 adapter_scalar_t *upper,
7341 size_t &nPartsFound,
7351 if (this->mj_keep_part_boxes) {
7354 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
7356 size_t nBoxes = (*partBoxes).size();
7358 throw std::logic_error(
"no part boxes exist");
7362 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
7364 if (globalBox->boxesOverlap(dim, lower, upper)) {
7366 std::vector<typename Adapter::part_t> partlist;
7369 for (
size_t i = 0; i < nBoxes; i++) {
7371 if ((*partBoxes)[i].boxesOverlap(dim, lower, upper)) {
7373 partlist.push_back((*partBoxes)[i].getpId());
7394 *partsFound =
new mj_part_t[nPartsFound];
7395 for (
size_t i = 0; i < nPartsFound; i++)
7396 (*partsFound)[i] = partlist[i];
7409 throw std::logic_error(
"need to use keep_cuts parameter for boxAssign");
7414 template <
typename Adapter>
7417 adapter_scalar_t *point)
const
7424 if (this->mj_keep_part_boxes) {
7428 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
7430 size_t nBoxes = (*partBoxes).size();
7432 throw std::logic_error(
"no part boxes exist");
7436 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
7438 if (globalBox->pointInBox(dim, point)) {
7442 for (i = 0; i < nBoxes; i++) {
7444 if ((*partBoxes)[i].pointInBox(dim, point)) {
7445 foundPart = (*partBoxes)[i].getpId();
7459 std::ostringstream oss;
7461 for (
int j = 0; j < dim; j++) oss << point[j] <<
" ";
7462 oss <<
") not found in domain";
7463 throw std::logic_error(oss.str());
7473 size_t closestBox = 0;
7474 coord_t minDistance = std::numeric_limits<coord_t>::max();
7475 coord_t *centroid =
new coord_t[dim];
7476 for (
size_t i = 0; i < nBoxes; i++) {
7477 (*partBoxes)[i].computeCentroid(centroid);
7480 for (
int j = 0; j < dim; j++) {
7481 diff = centroid[j] - point[j];
7484 if (sum < minDistance) {
7489 foundPart = (*partBoxes)[closestBox].getpId();
7496 throw std::logic_error(
"need to use keep_cuts parameter for pointAssign");
7500 template <
typename Adapter>
7506 if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL){
7507 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
7508 mj_part_t ntasks = (*pBoxes).size();
7509 int dim = (*pBoxes)[0].getDim();
7510 GridHash grid(pBoxes, ntasks, dim);
7518 template <
typename Adapter>
7519 RCP<typename Zoltan2_AlgMJ<Adapter>::mj_partBoxVector_t>
7522 return this->mj_partitioner.get_kept_boxes();
7526 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7528 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBoxVector_t>
7531 if (this->mj_keep_part_boxes)
7532 return this->kept_boxes;
7534 throw std::logic_error(
"Error: part boxes are not stored.");
7537 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7539 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBoxVector_t>
7541 RCP<mj_partBoxVector_t> &localPartBoxes
7545 mj_part_t ntasks = this->num_global_parts;
7546 int dim = (*localPartBoxes)[0].getDim();
7547 coord_t *localPartBoundaries =
new coord_t[ntasks * 2 *dim];
7549 memset(localPartBoundaries, 0,
sizeof(coord_t) * ntasks * 2 *dim);
7551 coord_t *globalPartBoundaries =
new coord_t[ntasks * 2 *dim];
7552 memset(globalPartBoundaries, 0,
sizeof(coord_t) * ntasks * 2 *dim);
7554 coord_t *localPartMins = localPartBoundaries;
7555 coord_t *localPartMaxs = localPartBoundaries + ntasks * dim;
7557 coord_t *globalPartMins = globalPartBoundaries;
7558 coord_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
7560 mj_part_t boxCount = localPartBoxes->size();
7561 for (mj_part_t i = 0; i < boxCount; ++i){
7562 mj_part_t pId = (*localPartBoxes)[i].getpId();
7565 coord_t *lmins = (*localPartBoxes)[i].getlmins();
7566 coord_t *lmaxs = (*localPartBoxes)[i].getlmaxs();
7568 for (
int j = 0; j < dim; ++j){
7569 localPartMins[dim * pId + j] = lmins[j];
7570 localPartMaxs[dim * pId + j] = lmaxs[j];
7582 reduceAll<int, coord_t>(*mj_problemComm, reductionOp,
7583 ntasks * 2 *dim, localPartBoundaries, globalPartBoundaries);
7584 RCP<mj_partBoxVector_t> pB(
new mj_partBoxVector_t(),
true);
7585 for (mj_part_t i = 0; i < ntasks; ++i){
7587 globalPartMaxs + dim * i);
7599 delete []localPartBoundaries;
7600 delete []globalPartBoundaries;
#define MIN_WORK_LAST_DIM
GridHash Class, Hashing Class for part boxes.
Time an algorithm (or other entity) as a whole.
void set(IT index_, CT count_, WT *vals_)
#define Z2_FORWARD_EXCEPTIONS
Forward an exception back through call stack.
Defines Parameter related enumerators, declares functions.
AlgMJ()
Multi Jagged coordinate partitioning algorithm default constructor.
static RCP< Teuchos::BoolParameterEntryValidator > getBoolValidator()
Exists to make setting up validators less cluttered.
void getAdjArrays(ArrayRCP< part_t > &comXAdj_, ArrayRCP< part_t > &comAdj_)
GridHash Class, returns the adj arrays.
Zoltan2_BoxBoundaries(Ordinal s_)
Constructor.
Sort items for quick sort function.
void uqSignsort(IT n, uSignedSortItem< IT, WT, SIGN > *arr)
Quick sort function. Sorts the arr of uSignedSortItems, with respect to increasing vals...
#define imbalanceOf2(Wachieved, wExpected)
void sequential_task_partitioning(const RCP< const Environment > &env, mj_lno_t num_total_coords, mj_lno_t num_selected_coords, size_t num_target_part, int coord_dim, mj_scalar_t **mj_coordinates, mj_lno_t *initial_selected_coords_output_permutation, mj_lno_t *output_xadj, int recursion_depth, const mj_part_t *part_no_array, bool partition_along_longest_dim, int num_ranks_per_node, bool divide_to_prime_first_)
Special function for partitioning for task mapping. Runs sequential, and performs deterministic parti...
RCP< mj_partBoxVector_t > get_kept_boxes() const
void freeArray(T *&array)
Frees the given array.
Class for sorting items with multiple values. First sorting with respect to val[0], then val[1] then ... val[count-1]. The last tie breaking is done with index values. Used for task mapping partitioning where the points on a cut line needs to be distributed consistently.
void multi_jagged_part(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, double imbalance_tolerance, size_t num_global_parts, mj_part_t *part_no_array, int recursion_depth, int coord_dim, mj_lno_t num_local_coords, mj_gno_t num_global_coords, const mj_gno_t *initial_mj_gnos, mj_scalar_t **mj_coordinates, int num_weights_per_coord, bool *mj_uniform_weights, mj_scalar_t **mj_weights, bool *mj_uniform_parts, mj_scalar_t **mj_part_sizes, mj_part_t *&result_assigned_part_ids, mj_gno_t *&result_mj_gnos)
Multi Jagged coordinate partitioning algorithm.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyDoubleValidator()
Exists to make setting up validators less cluttered.
void partition(const RCP< PartitioningSolution< Adapter > > &solution)
Multi Jagged coordinate partitioning algorithm.
void reduce(const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
Implement Teuchos::ValueTypeReductionOp interface.
coordinateModelPartBox Class, represents the boundaries of the box which is a result of a geometric p...
A ParameterList validator for integer range lists.
void set_to_keep_part_boxes()
Function call, if the part boxes are intended to be kept.
void getCommunicationGraph(const PartitioningSolution< Adapter > *solution, ArrayRCP< mj_part_t > &comXAdj, ArrayRCP< mj_part_t > &comAdj)
returns communication graph resulting from MJ partitioning.
bool operator>=(const uSignedSortItem< IT, WT, SIGN > &rhs)
SparseMatrixAdapter_t::part_t part_t
#define FUTURE_REDUCEALL_CUTOFF
Multi Jagged coordinate partitioning algorithm.
This class provides geometric coordinates with optional weights to the Zoltan2 algorithm.
void boxAssign(int dim, adapter_scalar_t *lower, adapter_scalar_t *upper, size_t &nPartsFound, mj_part_t **partsFound) const
T * allocMemory(size_t size)
Allocates memory for the given size.
uMultiSortItem< IT, CT, WT > operator=(const uMultiSortItem< IT, CT, WT > &other)
A PartitioningSolution is a solution to a partitioning problem.
Zoltan2_BoxBoundaries()
Default Constructor.
RCP< mj_partBoxVector_t > compute_global_box_boundaries(RCP< mj_partBoxVector_t > &localPartBoxes) const
uMultiSortItem(const uMultiSortItem< IT, CT, WT > &other)
#define LEAST_SIGNIFICANCE
Zoltan2_BoxBoundaries is a reduction operation to all reduce the all box boundaries.
Algorithm defines the base class for all algorithms.
mj_partBoxVector_t & getPartBoxesView() const
for partitioning methods, return bounding boxes of the
#define Z2_ASSERT_VALUE(actual, expected)
Throw an error when actual value is not equal to expected value.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyIntValidator()
Exists to make setting up validators less cluttered.
RCP< mj_partBox_t > get_global_box() const
Return the global bounding box: min/max coords of global domain.
uMultiSortItem(IT index_, CT count_, WT *vals_)
Define IntegerRangeList validator.
Defines the CoordinateModel classes.
bool operator>(const uSignedSortItem< IT, WT, SIGN > &rhs) const
bool operator>(const uMultiSortItem< IT, CT, WT > &other) const
Tpetra::global_size_t global_size_t
mj_part_t pointAssign(int dim, adapter_scalar_t *point) const
#define Z2_THROW_OUTSIDE_ERROR(env)
Throw an error returned from outside the Zoltan2 library.
#define ZOLTAN2_ALGMULTIJAGGED_SWAP(a, b, temp)
void set_partitioning_parameters(bool distribute_points_on_cut_lines_, int max_concurrent_part_calculation_, int check_migrate_avoid_migration_option_, double minimum_migration_imbalance_, int migration_type_=0)
Multi Jagged coordinate partitioning algorithm.
A gathering of useful namespace methods.
void uqsort(IT n, uSortItem< IT, WT > *arr)
Quick sort function. Sorts the arr of uSortItems, with respect to increasing vals.
Zoltan2_AlgMJ(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, const RCP< const coordinateModel_t > &coords)
Contains Teuchos redcution operators for the Multi-jagged algorthm.
Multi Jagged coordinate partitioning algorithm.
static void getValidParameters(ParameterList &pl)
Set up validators specific to this algorithm.