49 #ifndef _ZOLTAN2_ALGMultiJagged_HPP_
50 #define _ZOLTAN2_ALGMultiJagged_HPP_
59 #include <Tpetra_Distributor.hpp>
60 #include <Teuchos_StandardParameterEntryValidators.hpp>
61 #include <Teuchos_ParameterList.hpp>
62 #include <Kokkos_Sort.hpp>
66 #include <unordered_map>
68 #ifdef ZOLTAN2_USEZOLTANCOMM
69 #ifdef HAVE_ZOLTAN2_MPI
70 #define ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
71 #include "zoltan_comm_cpp.h"
72 #include "zoltan_types.h"
81 template <
typename Ordinal,
typename T>
92 epsilon(std::numeric_limits<T>::epsilon()) {}
98 size(s_), epsilon(std::numeric_limits<T>::epsilon()) {}
105 void reduce(
const Ordinal count,
const T inBuffer[], T inoutBuffer[])
const {
106 for(Ordinal i = 0; i < count; i++) {
107 if(
Z2_ABS(inBuffer[i]) > epsilon) {
108 inoutBuffer[i] = inBuffer[i];
124 template <
typename IT,
typename CT,
typename WT>
144 this->
index = index_;
145 this->
count = count_;
153 void set(IT index_ ,CT count_, WT *vals_) {
154 this->
index = index_;
155 this->
count = count_;
159 bool operator<(const uMultiSortItem<IT,CT,WT>& other)
const {
160 assert(this->
count == other.count);
161 for(CT i = 0; i < this->
count; ++i) {
163 if(std::abs(this->
val[i] - other.val[i]) < this->epsilon) {
167 if(this->
val[i] < other.val[i]) {
176 return this->
index < other.index;
182 template <
class IT,
class WT>
193 template <
class IT,
class WT>
195 const int NSTACK = 50;
197 IT i, ir=n, j, k, l=1;
198 IT jstack=0, istack[NSTACK];
205 for(j=l+1;j<=ir;j++) {
208 for(i=j-1;i>=1;i--) {
209 if(arr[i].val <= aval)
222 std::swap(arr[k],arr[l+1]);
223 if(arr[l+1].val > arr[ir].val) {
224 std::swap(arr[l+1],arr[ir]);
226 if(arr[l].val > arr[ir].val) {
227 std::swap(arr[l],arr[ir]);
229 if(arr[l+1].val > arr[l].val) {
230 std::swap(arr[l+1],arr[l]);
237 do i++;
while (arr[i].val < aval);
238 do j--;
while (arr[j].val > aval);
240 std::swap(arr[i],arr[j]);
245 if(jstack > NSTACK) {
246 std::cout <<
"uqsort: NSTACK too small in sort." << std::endl;
263 template <
class IT,
class WT,
class SIGN>
269 bool operator<(const uSignedSortItem<IT, WT, SIGN>& rhs)
const {
271 if(this->
signbit < rhs.signbit) {
275 else if(this->
signbit == rhs.signbit) {
276 if(this->
val < rhs.val) {
280 else if(this->
val > rhs.val) {
294 bool operator<=(const uSignedSortItem<IT, WT, SIGN>& rhs) {
295 return (this->
val == rhs.val && this->signbit == rhs.signbit) || (*
this < rhs);
302 template <
class IT,
class WT,
class SIGN>
304 const IT NSTACK = 50;
306 IT i, ir=n, j, k, l=1;
307 IT jstack=0, istack[NSTACK];
313 for(j=l+1;j<=ir;j++) {
315 for(i=j-1;i>=1;i--) {
331 std::swap(arr[k],arr[l+1]);
332 if(arr[ir] < arr[l+1]) {
333 std::swap(arr[l+1],arr[ir]);
335 if(arr[ir] < arr[l] ) {
336 std::swap(arr[l],arr[ir]);
338 if(arr[l] < arr[l+1]) {
339 std::swap(arr[l+1],arr[l]);
345 do i++;
while (arr[i] < a);
346 do j--;
while (a < arr[j]);
348 std::swap(arr[i],arr[j]);
353 if(jstack > NSTACK) {
354 std::cout <<
"uqsort: NSTACK too small in sort." << std::endl;
390 static int counter = 0;
394 static int counter = 0;
401 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
402 typename mj_part_t,
typename mj_node_t>
406 typedef typename mj_node_t::device_type device_t;
408 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
413 static constexpr
size_t future_reduceall_cutoff = 1500000;
417 static constexpr mj_lno_t min_work_last_dim = 1000;
419 static constexpr mj_scalar_t least_signifiance = 0.0001;
420 static constexpr
int significance_mul = 1000;
422 std::string mj_timer_base_string;
424 RCP<const Environment> mj_env;
425 RCP<const Comm<int> > mj_problemComm;
426 RCP<Comm<int> > comm;
427 double imbalance_tolerance;
430 int num_weights_per_coord;
431 size_t initial_num_loc_coords;
433 mj_lno_t num_local_coords;
434 mj_gno_t num_global_coords;
435 mj_scalar_t sEpsilon;
438 bool distribute_points_on_cut_lines;
441 mj_part_t max_concurrent_part_calculation;
444 int mj_user_recursion_depth;
445 bool mj_keep_part_boxes;
448 int check_migrate_avoid_migration_option;
455 double minimum_migration_imbalance;
472 mj_part_t num_first_level_parts;
476 Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;
478 mj_part_t total_num_cut ;
479 mj_part_t total_num_part;
481 mj_part_t max_num_part_along_dim ;
482 mj_part_t max_num_cut_along_dim;
485 size_t max_num_total_part_along_dim;
487 mj_part_t total_dim_num_reduce_all;
491 mj_part_t last_dim_num_part;
494 Kokkos::View<mj_part_t *, Kokkos::HostSpace> part_no_array;
498 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
502 Kokkos::View<mj_scalar_t **, device_t> mj_weights;
505 Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_parts;
508 Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_weights;
512 size_t num_global_parts;
515 RCP<mj_partBoxVector_t> kept_boxes;
517 RCP<mj_partBox_t> global_box;
522 bool divide_to_prime_first;
525 Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
528 Kokkos::View<mj_gno_t*, device_t> current_mj_gnos;
531 Kokkos::View<int*, Kokkos::HostSpace> owner_of_coordinate;
534 Kokkos::View<mj_lno_t*, device_t> coordinate_permutations;
537 Kokkos::View<mj_lno_t*, device_t> new_coordinate_permutations;
540 Kokkos::View<mj_part_t*, device_t> assigned_part_ids;
543 Kokkos::View<mj_lno_t *, device_t> part_xadj;
546 Kokkos::View<mj_lno_t *, device_t> new_part_xadj;
548 Kokkos::View<mj_scalar_t *, device_t> all_cut_coordinates;
551 Kokkos::View<mj_scalar_t *, device_t>
552 process_cut_line_weight_to_put_left;
555 Kokkos::View<mj_scalar_t *, device_t>
556 thread_cut_line_weight_to_put_left;
562 Kokkos::View<mj_scalar_t *, device_t> cut_coordinates_work_array;
565 Kokkos::View<mj_scalar_t *, device_t> temp_cut_coords;
568 Kokkos::View<mj_scalar_t *, device_t> target_part_weights;
571 Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_coordinates;
574 Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_coordinates;
577 Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_weights;
580 Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_weights;
584 Kokkos::View<mj_scalar_t *, device_t>
585 process_local_min_max_coord_total_weight;
588 Kokkos::View<mj_scalar_t *, device_t>
589 global_min_max_coord_total_weight;
593 Kokkos::View<bool *, device_t> is_cut_line_determined;
599 Kokkos::View<mj_part_t *, device_t> device_incomplete_cut_count;
600 typename decltype(device_incomplete_cut_count)::HostMirror
601 incomplete_cut_count;
604 typename decltype (part_xadj)::HostMirror host_part_xadj;
607 Kokkos::View<double *, device_t>
611 Kokkos::View<double *, device_t>
612 thread_part_weight_work;
616 Kokkos::View<mj_scalar_t *, device_t>
617 thread_cut_left_closest_point;
621 Kokkos::View<mj_scalar_t *, device_t>
622 thread_cut_right_closest_point;
625 Kokkos::View<mj_lno_t *, device_t>
628 Kokkos::View<mj_scalar_t *, device_t> process_rectilinear_cut_weight;
629 Kokkos::View<mj_scalar_t *, device_t> global_rectilinear_cut_weight;
635 Kokkos::View<mj_scalar_t *, device_t>
636 total_part_weight_left_right_closests;
637 Kokkos::View<mj_scalar_t *, device_t>
638 global_total_part_weight_left_right_closests;
640 Kokkos::View<mj_part_t*, device_t> device_num_partitioning_in_current_dim;
641 typename decltype(device_num_partitioning_in_current_dim)::HostMirror
642 host_num_partitioning_in_current_dim;
649 KOKKOS_INLINE_FUNCTION
650 double calculate_imbalance(mj_scalar_t achieved, mj_scalar_t expected) {
651 return static_cast<double>(achieved) / static_cast<double>(expected) - 1.0;
660 void set_part_specifications();
667 inline mj_part_t get_part_count(
668 mj_part_t num_total_future,
677 void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);
698 mj_part_t update_part_num_arrays(
699 std::vector<mj_part_t> *future_num_part_in_parts,
700 std::vector<mj_part_t> *next_future_num_parts_in_parts,
701 mj_part_t &future_num_parts,
702 mj_part_t current_num_parts,
703 int current_iteration,
704 RCP<mj_partBoxVector_t> input_part_boxes,
705 RCP<mj_partBoxVector_t> output_part_boxes,
706 mj_part_t atomic_part_count);
720 KOKKOS_INLINE_FUNCTION
721 void mj_calculate_new_cut_position (
722 mj_scalar_t cut_upper_bound,
723 mj_scalar_t cut_lower_bound,
724 mj_scalar_t cut_upper_weight,
725 mj_scalar_t cut_lower_weight,
726 mj_scalar_t expected_weight,
727 mj_scalar_t &new_cut_position,
728 mj_scalar_t sEpsilon);
754 bool mj_perform_migration(
755 mj_part_t in_num_parts,
756 mj_part_t &out_num_parts,
757 std::vector<mj_part_t> *next_future_num_parts_in_parts,
758 mj_part_t &output_part_begin_index,
759 size_t migration_reduce_all_population,
760 mj_lno_t num_coords_for_last_dim_part,
761 std::string iteration,
762 RCP<mj_partBoxVector_t> &input_part_boxes,
763 RCP<mj_partBoxVector_t> &output_part_boxes);
782 bool mj_check_to_migrate(
783 size_t migration_reduce_all_population,
784 mj_lno_t num_coords_for_last_dim_part,
787 mj_gno_t *num_points_in_all_processor_parts);
813 void mj_migration_part_proc_assignment(
814 mj_gno_t * num_points_in_all_processor_parts,
817 mj_lno_t *send_count_to_each_proc,
818 std::vector<mj_part_t> &processor_ranks_for_subcomm,
819 std::vector<mj_part_t> *next_future_num_parts_in_parts,
820 mj_part_t &out_num_part,
821 std::vector<mj_part_t> &out_part_indices,
822 mj_part_t &output_part_numbering_begin_index,
823 int *coordinate_destinations);
850 void mj_assign_proc_to_parts(
851 mj_gno_t * num_points_in_all_processor_parts,
854 mj_lno_t *send_count_to_each_proc,
855 std::vector<mj_part_t> &processor_ranks_for_subcomm,
856 std::vector<mj_part_t> *next_future_num_parts_in_parts,
857 mj_part_t &out_part_index,
858 mj_part_t &output_part_numbering_begin_index,
859 int *coordinate_destinations);
876 void assign_send_destinations(
878 mj_part_t *part_assignment_proc_begin_indices,
879 mj_part_t *processor_chains_in_parts,
880 mj_lno_t *send_count_to_each_proc,
881 int *coordinate_destinations);
897 void assign_send_destinations2(
900 int *coordinate_destinations,
901 mj_part_t &output_part_numbering_begin_index,
902 std::vector<mj_part_t> *next_future_num_parts_in_parts);
926 void mj_assign_parts_to_procs(
927 mj_gno_t * num_points_in_all_processor_parts,
930 mj_lno_t *send_count_to_each_proc,
931 std::vector<mj_part_t> *next_future_num_parts_in_parts,
932 mj_part_t &out_num_part,
933 std::vector<mj_part_t> &out_part_indices,
934 mj_part_t &output_part_numbering_begin_index,
935 int *coordinate_destinations);
950 void mj_migrate_coords(
952 mj_lno_t &num_new_local_points,
953 std::string iteration,
954 int *coordinate_destinations,
955 mj_part_t num_parts);
962 void create_sub_communicator(
963 std::vector<mj_part_t> &processor_ranks_for_subcomm);
969 mj_part_t find_largest_prime_factor(mj_part_t num_parts) {
970 mj_part_t largest_factor = 1;
971 mj_part_t n = num_parts;
972 mj_part_t divisor = 2;
974 while (n % divisor == 0) {
976 largest_factor = divisor;
979 if(divisor * divisor > n) {
986 return largest_factor;
1019 const RCP<const Environment> &env,
1020 RCP<
const Comm<int> > &problemComm,
1021 double imbalance_tolerance,
1023 size_t num_global_parts,
1024 Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array,
1025 int recursion_depth,
1027 mj_lno_t num_local_coords,
1028 mj_gno_t num_global_coords,
1029 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos,
1031 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates,
1032 int num_weights_per_coord,
1033 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights,
1034 Kokkos::View<mj_scalar_t**, device_t> & mj_weights,
1035 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts,
1036 Kokkos::View<mj_part_t*, device_t> & result_assigned_part_ids,
1037 Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos);
1052 bool distribute_points_on_cut_lines_,
1053 int max_concurrent_part_calculation_,
1054 int check_migrate_avoid_migration_option_,
1055 double minimum_migration_imbalance_,
1056 int migration_type_ = 0);
1073 RCP<mj_partBoxVector_t> &localPartBoxes)
const;
1114 const RCP<const Environment> &env,
1115 mj_lno_t num_total_coords,
1116 mj_lno_t num_selected_coords,
1117 size_t num_target_part,
1120 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
1121 Kokkos::View<mj_lno_t *, device_t> &
1122 initial_selected_coords_output_permutation,
1123 mj_lno_t *output_xadj,
1124 int recursion_depth_,
1125 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array,
1126 bool partition_along_longest_dim,
1127 int num_ranks_per_node,
1128 bool divide_to_prime_first_,
1129 mj_part_t num_first_level_parts_ = 1,
1130 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_
1131 = Kokkos::View<mj_part_t *, Kokkos::HostSpace>());
1133 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
1141 void allocate_set_work_memory();
1144 void compute_global_box();
1153 void mj_get_local_min_max_coord_totW(
1154 mj_part_t current_work_part,
1155 mj_part_t current_concurrent_num_parts,
1156 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords);
1170 void mj_get_global_min_max_coord_totW(
1171 mj_part_t current_concurrent_num_parts,
1172 Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
1173 Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total);
1205 void mj_get_initial_cut_coords_target_weights(
1206 mj_scalar_t min_coord,
1207 mj_scalar_t max_coord,
1208 mj_part_t num_cuts ,
1209 mj_scalar_t global_weight,
1210 Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
1211 Kokkos::View<mj_scalar_t *, device_t> & target_part_weights,
1212 std::vector <mj_part_t> *future_num_part_in_parts,
1213 std::vector <mj_part_t> *next_future_num_parts_in_parts,
1214 mj_part_t concurrent_current_part,
1215 mj_part_t obtained_part_index,
1216 mj_part_t num_target_first_level_parts = 1,
1217 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist =
1218 Kokkos::View<mj_part_t *, Kokkos::HostSpace>());
1236 void set_initial_coordinate_parts(
1237 mj_scalar_t &max_coordinate,
1238 mj_scalar_t &min_coordinate,
1239 mj_lno_t coordinate_begin_index,
1240 mj_lno_t coordinate_end_index,
1241 Kokkos::View<mj_lno_t *, device_t> &
1242 mj_current_coordinate_permutations,
1243 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1244 Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
1245 mj_part_t &partition_count);
1264 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1265 double imbalanceTolerance,
1266 mj_part_t current_work_part,
1267 mj_part_t current_concurrent_num_parts,
1268 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
1269 mj_part_t total_incomplete_cut_count,
1270 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
1271 Kokkos::View<size_t*, device_t> & view_total_reduction_size);
1278 void mj_1D_part_get_part_weights(
1279 mj_part_t current_concurrent_num_parts,
1280 mj_part_t current_work_part,
1281 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1291 void mj_combine_rightleft_and_weights(
1292 mj_part_t current_work_part,
1293 mj_part_t current_concurrent_num_parts);
1307 void mj_create_new_partitions(
1308 mj_part_t num_parts,
1309 mj_part_t current_concurrent_work_part,
1310 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1311 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
1312 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
1313 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj);
1350 void mj_get_new_cut_coordinates(
1351 mj_part_t current_concurrent_num_parts,
1353 const mj_part_t &num_cuts,
1354 const double &used_imbalance_tolerance,
1355 Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
1356 Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
1357 Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
1358 Kokkos::View<bool *, device_t> & current_cut_line_determined,
1359 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
1360 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
1361 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
1362 Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
1363 Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
1364 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
1365 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
1366 Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
1367 Kokkos::View<mj_scalar_t *, device_t> &
1368 current_part_cut_line_weight_to_put_left,
1369 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count);
1380 void get_processor_num_points_in_parts(
1381 mj_part_t num_procs,
1382 mj_part_t num_parts,
1383 mj_gno_t *&num_points_in_all_processor_parts);
1389 void fill_permutation_array(
1390 mj_part_t output_num_parts,
1391 mj_part_t num_parts);
1414 void create_consistent_chunks(
1415 mj_part_t num_parts,
1416 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1417 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
1418 mj_lno_t coordinate_begin,
1419 mj_lno_t coordinate_end,
1420 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
1421 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
1423 bool longest_dim_part,
1434 void set_final_parts(
1435 mj_part_t current_num_parts,
1436 mj_part_t output_part_begin_index,
1437 RCP<mj_partBoxVector_t> &output_part_boxes,
1438 bool is_data_ever_migrated);
1443 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1444 typename mj_part_t,
typename mj_node_t>
1446 mj_env(), mj_problemComm(), comm(), imbalance_tolerance(0),
1447 recursion_depth(0), coord_dim(0),
1448 num_weights_per_coord(0), initial_num_loc_coords(0),
1449 initial_num_glob_coords(0),
1450 num_local_coords(0), num_global_coords(0),
1451 sEpsilon(std::numeric_limits<mj_scalar_t>::
epsilon() * 100),
1452 distribute_points_on_cut_lines(true),
1453 max_concurrent_part_calculation(1),
1454 mj_run_as_rcb(false), mj_user_recursion_depth(0),
1455 mj_keep_part_boxes(false),
1456 check_migrate_avoid_migration_option(0), migration_type(0),
1457 minimum_migration_imbalance(0.30),
1458 num_first_level_parts(1),
1459 total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
1460 max_num_cut_along_dim(0),
1461 max_num_total_part_along_dim(0),
1462 total_dim_num_reduce_all(0),
1463 last_dim_num_part(0),
1465 num_global_parts(1),
1466 kept_boxes(), global_box(),
1467 myRank(0), myActualRank(0),
1468 divide_to_prime_first(false)
1515 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1516 typename mj_part_t,
typename mj_node_t>
1519 const RCP<const Environment> &env,
1520 mj_lno_t num_total_coords,
1521 mj_lno_t num_selected_coords,
1522 size_t num_target_part,
1525 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> &
1527 Kokkos::View<mj_lno_t *, device_t> & initial_adjList_output_adjlist,
1528 mj_lno_t *output_xadj,
1529 int recursion_depth_,
1530 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array_,
1531 bool partition_along_longest_dim,
1532 int num_ranks_per_node,
1533 bool divide_to_prime_first_,
1534 mj_part_t num_first_level_parts_,
1535 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_)
1538 const RCP<Comm<int> > commN;
1539 this->mj_problemComm = Teuchos::DefaultComm<int>::getDefaultSerialComm(commN);
1540 this->comm = Teuchos::rcp_const_cast<Comm<int> >(this->mj_problemComm);
1541 this->myActualRank = this->myRank = 1;
1543 this->divide_to_prime_first = divide_to_prime_first_;
1548 this->imbalance_tolerance = 0;
1549 this->num_global_parts = num_target_part;
1550 this->part_no_array = part_no_array_;
1551 this->recursion_depth = recursion_depth_;
1555 this->num_first_level_parts = num_first_level_parts_;
1557 this->first_level_distribution = first_level_distribution_;
1559 this->coord_dim = coord_dim_;
1560 this->num_local_coords = num_total_coords;
1562 this->num_global_coords = num_total_coords;
1563 this->mj_coordinates = mj_coordinates_;
1566 this->initial_mj_gnos =
1567 Kokkos::View<mj_gno_t*, device_t>(
"gids", this->num_local_coords);
1569 this->num_weights_per_coord = 0;
1571 this->mj_uniform_weights = Kokkos::View<bool*, Kokkos::HostSpace>(
1572 "uniform weights", 1);
1573 this->mj_uniform_weights(0) =
true;
1575 this->mj_weights = Kokkos::View<mj_scalar_t**, device_t>
1578 this->mj_uniform_parts =
1579 Kokkos::View<bool*, Kokkos::HostSpace>(
"uniform parts", 1);
1580 this->mj_uniform_parts(0) =
true;
1582 this->set_part_specifications();
1584 this->allocate_set_work_memory();
1587 auto local_part_xadj = this->part_xadj;
1588 Kokkos::parallel_for(
1589 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
1590 KOKKOS_LAMBDA (
int dummy) {
1591 local_part_xadj(0) =
static_cast<mj_lno_t
>(num_selected_coords);
1594 Kokkos::deep_copy(coordinate_permutations, initial_adjList_output_adjlist);
1596 mj_part_t current_num_parts = 1;
1598 Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
1599 this->all_cut_coordinates;
1601 mj_part_t future_num_parts = this->total_num_part;
1603 std::vector<mj_part_t> *future_num_part_in_parts =
1604 new std::vector<mj_part_t>();
1605 std::vector<mj_part_t> *next_future_num_parts_in_parts =
1606 new std::vector<mj_part_t>();
1607 next_future_num_parts_in_parts->push_back(this->num_global_parts);
1608 RCP<mj_partBoxVector_t> t1;
1609 RCP<mj_partBoxVector_t> t2;
1611 std::vector <uSignedSortItem<int, mj_scalar_t, char>>
1612 coord_dimension_range_sorted(this->coord_dim);
1614 &(coord_dimension_range_sorted[0]);
1615 std::vector <mj_scalar_t> coord_dim_mins(this->coord_dim);
1616 std::vector <mj_scalar_t> coord_dim_maxs(this->coord_dim);
1620 Kokkos::View<mj_part_t*, device_t>
1621 view_rectilinear_cut_count(
"view_rectilinear_cut_count", 1);
1622 Kokkos::View<size_t*, device_t>
1623 view_total_reduction_size(
"view_total_reduction_size", 1);
1625 for(
int rd = 0; rd < this->recursion_depth; ++rd) {
1631 std::vector<mj_part_t> *tmpPartVect = future_num_part_in_parts;
1632 future_num_part_in_parts = next_future_num_parts_in_parts;
1633 next_future_num_parts_in_parts = tmpPartVect;
1637 next_future_num_parts_in_parts->clear();
1640 mj_part_t output_part_count_in_dimension =
1641 this->update_part_num_arrays(
1642 future_num_part_in_parts,
1643 next_future_num_parts_in_parts,
1648 t2, num_ranks_per_node);
1653 if(output_part_count_in_dimension == current_num_parts) {
1654 tmpPartVect = future_num_part_in_parts;
1655 future_num_part_in_parts = next_future_num_parts_in_parts;
1656 next_future_num_parts_in_parts = tmpPartVect;
1661 std::string istring = std::to_string(rd);
1665 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
1666 "new part xadj", output_part_count_in_dimension);
1670 mj_part_t output_part_index = 0;
1674 mj_part_t output_coordinate_end_index = 0;
1676 mj_part_t current_work_part = 0;
1677 mj_part_t current_concurrent_num_parts = 1;
1679 mj_part_t obtained_part_index = 0;
1682 int coordInd = rd % this->coord_dim;
1684 Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
1685 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1687 auto host_process_local_min_max_coord_total_weight =
1688 Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
1689 auto host_global_min_max_coord_total_weight =
1690 Kokkos::create_mirror_view(global_min_max_coord_total_weight);
1693 for(; current_work_part < current_num_parts;
1694 current_work_part += current_concurrent_num_parts) {
1696 mj_part_t actual_work_part_count = 0;
1701 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1702 mj_part_t current_work_part_in_concurrent_parts =
1703 current_work_part + kk;
1707 mj_part_t partition_count = host_num_partitioning_in_current_dim(
1708 current_work_part_in_concurrent_parts);
1709 if(partition_count == 1) {
1712 ++actual_work_part_count;
1713 if(partition_along_longest_dim) {
1714 auto local_process_local_min_max_coord_total_weight =
1715 this->process_local_min_max_coord_total_weight;
1716 for(
int coord_traverse_ind = 0;
1717 coord_traverse_ind < this->coord_dim; ++coord_traverse_ind) {
1719 Kokkos::View<mj_scalar_t *, device_t> coords =
1720 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coord_traverse_ind);
1722 this->mj_get_local_min_max_coord_totW(
1724 current_concurrent_num_parts,
1727 coord_dimension_range_sorted[coord_traverse_ind].id =
1729 coord_dimension_range_sorted[coord_traverse_ind].signbit = 1;
1731 Kokkos::deep_copy(host_process_local_min_max_coord_total_weight,
1732 process_local_min_max_coord_total_weight);
1734 coord_dim_mins[coord_traverse_ind] =
1735 host_process_local_min_max_coord_total_weight(kk);
1736 coord_dim_maxs[coord_traverse_ind] =
1737 host_process_local_min_max_coord_total_weight(
1738 kk + current_concurrent_num_parts);
1739 coord_dimension_range_sorted[coord_traverse_ind].val =
1740 host_process_local_min_max_coord_total_weight(
1741 kk + current_concurrent_num_parts) -
1742 host_process_local_min_max_coord_total_weight(kk);
1745 uqSignsort(this->coord_dim, p_coord_dimension_range_sorted);
1746 coordInd = p_coord_dimension_range_sorted[this->coord_dim - 1].
id;
1747 auto set_min = coord_dim_mins[coordInd];
1748 auto set_max = coord_dim_maxs[coordInd];
1749 Kokkos::parallel_for(
1750 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
1751 (0, 1), KOKKOS_LAMBDA (
int dummy) {
1752 local_process_local_min_max_coord_total_weight(kk) = set_min;
1753 local_process_local_min_max_coord_total_weight(
1754 kk + current_concurrent_num_parts) = set_max;
1757 mj_current_dim_coords =
1758 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1761 Kokkos::View<mj_scalar_t *, device_t> coords =
1762 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1763 this->mj_get_local_min_max_coord_totW(
1765 current_concurrent_num_parts,
1771 if(actual_work_part_count > 0) {
1773 this->mj_get_global_min_max_coord_totW(
1774 current_concurrent_num_parts,
1775 this->process_local_min_max_coord_total_weight,
1776 this->global_min_max_coord_total_weight);
1779 Kokkos::deep_copy(host_global_min_max_coord_total_weight,
1780 global_min_max_coord_total_weight);
1784 mj_part_t total_incomplete_cut_count = 0;
1789 mj_part_t concurrent_part_cut_shift = 0;
1790 mj_part_t concurrent_part_part_shift = 0;
1791 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1792 mj_scalar_t min_coordinate =
1793 host_global_min_max_coord_total_weight(kk);
1794 mj_scalar_t max_coordinate = host_global_min_max_coord_total_weight(
1795 kk + current_concurrent_num_parts);
1796 mj_scalar_t global_total_weight = host_global_min_max_coord_total_weight(
1797 kk + 2*current_concurrent_num_parts);
1799 mj_part_t concurrent_current_part_index = current_work_part + kk;
1801 mj_part_t partition_count = host_num_partitioning_in_current_dim(
1802 concurrent_current_part_index);
1804 Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
1805 Kokkos::subview(current_cut_coordinates,
1806 std::pair<mj_lno_t, mj_lno_t>(
1807 concurrent_part_cut_shift,
1808 current_cut_coordinates.size()));
1809 Kokkos::View<mj_scalar_t *, device_t>
1810 current_target_part_weights =
1811 Kokkos::subview(target_part_weights,
1812 std::pair<mj_lno_t, mj_lno_t>(
1813 concurrent_part_part_shift,
1814 target_part_weights.size()));
1817 concurrent_part_cut_shift += partition_count - 1;
1819 concurrent_part_part_shift += partition_count;
1822 if(partition_count > 1 && min_coordinate <= max_coordinate) {
1825 total_incomplete_cut_count += partition_count - 1;
1827 this->incomplete_cut_count(kk) = partition_count - 1;
1835 this->mj_get_initial_cut_coords_target_weights(
1838 partition_count - 1,
1839 global_total_weight,
1841 current_target_part_weights,
1842 future_num_part_in_parts,
1843 next_future_num_parts_in_parts,
1844 concurrent_current_part_index,
1845 obtained_part_index,
1846 rd == 0 ? this->num_first_level_parts : 1,
1847 this->first_level_distribution);
1849 mj_lno_t coordinate_end_index =
1850 host_part_xadj(concurrent_current_part_index);
1851 mj_lno_t coordinate_begin_index =
1852 (concurrent_current_part_index==0) ? 0 :
1853 host_part_xadj[concurrent_current_part_index - 1];
1856 this->set_initial_coordinate_parts(
1859 coordinate_begin_index, coordinate_end_index,
1860 this->coordinate_permutations,
1861 mj_current_dim_coords,
1862 this->assigned_part_ids,
1868 this->incomplete_cut_count(kk) = 0;
1870 obtained_part_index += partition_count;
1875 double used_imbalance = 0;
1879 mj_timer_base_string +
"mj_1D_part()");
1882 mj_current_dim_coords,
1885 current_concurrent_num_parts,
1886 current_cut_coordinates,
1887 total_incomplete_cut_count,
1888 view_rectilinear_cut_count,
1889 view_total_reduction_size);
1892 mj_timer_base_string +
"mj_1D_part()");
1895 obtained_part_index += current_concurrent_num_parts;
1899 mj_part_t output_array_shift = 0;
1900 mj_part_t cut_shift = 0;
1901 size_t tlr_shift = 0;
1902 size_t partweight_array_shift = 0;
1904 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1905 mj_part_t current_concurrent_work_part = current_work_part + kk;
1907 mj_part_t num_parts = host_num_partitioning_in_current_dim(
1908 current_concurrent_work_part);
1911 int coordinateA_bigger_than_coordinateB =
1912 host_global_min_max_coord_total_weight(kk) >
1913 host_global_min_max_coord_total_weight(
1914 kk + current_concurrent_num_parts);
1916 if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
1919 auto local_new_part_xadj = this->new_part_xadj;
1920 Kokkos::parallel_for(
1921 Kokkos::RangePolicy<
typename mj_node_t::execution_space,
1922 mj_part_t> (0, num_parts), KOKKOS_LAMBDA(mj_part_t jj) {
1923 local_new_part_xadj(
1924 output_part_index + output_array_shift + jj) = 0;
1927 cut_shift += num_parts - 1;
1928 tlr_shift += (4 *(num_parts - 1) + 1);
1929 output_array_shift += num_parts;
1930 partweight_array_shift += (2 * (num_parts - 1) + 1);
1933 mj_lno_t coordinate_end =
1934 host_part_xadj(current_concurrent_work_part);
1935 mj_lno_t coordinate_begin =
1936 current_concurrent_work_part==0 ? 0 :
1937 host_part_xadj(current_concurrent_work_part-1);
1939 Kokkos::View<mj_scalar_t *, device_t>
1940 current_concurrent_cut_coordinate =
1941 Kokkos::subview(current_cut_coordinates,
1942 std::pair<mj_lno_t, mj_lno_t>(
1944 current_cut_coordinates.size()));
1945 Kokkos::View<mj_scalar_t *, device_t>
1946 used_local_cut_line_weight_to_left =
1947 Kokkos::subview(process_cut_line_weight_to_put_left,
1948 std::pair<mj_lno_t, mj_lno_t>(
1950 process_cut_line_weight_to_put_left.size()));
1952 this->thread_part_weight_work =
1954 this->thread_part_weights,
1955 std::pair<mj_lno_t, mj_lno_t>(
1956 partweight_array_shift,
1957 this->thread_part_weights.size()));
1961 Kokkos::View<mj_lno_t *, device_t> subview_new_part_xadj =
1962 Kokkos::subview(this->new_part_xadj,
1963 std::pair<mj_lno_t, mj_lno_t>(
1964 output_part_index + output_array_shift,
1965 this->new_part_xadj.size()));
1967 this->create_consistent_chunks(
1969 mj_current_dim_coords,
1970 current_concurrent_cut_coordinate,
1973 used_local_cut_line_weight_to_left,
1974 subview_new_part_xadj,
1976 partition_along_longest_dim,
1977 p_coord_dimension_range_sorted);
1982 mj_lno_t part_size = coordinate_end - coordinate_begin;
1984 auto local_new_part_xadj = this->new_part_xadj;
1985 Kokkos::parallel_for(
1986 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
1987 (0, 1), KOKKOS_LAMBDA (
int dummy) {
1988 local_new_part_xadj(output_part_index + output_array_shift)
1992 auto subview_new_coordinate_permutations =
1993 Kokkos::subview(this->new_coordinate_permutations,
1994 std::pair<mj_lno_t, mj_lno_t>(
1996 coordinate_begin + part_size));
1997 auto subview_coordinate_permutations =
1998 Kokkos::subview(this->coordinate_permutations,
1999 std::pair<mj_lno_t, mj_lno_t>(
2001 coordinate_begin + part_size));
2002 Kokkos::deep_copy(subview_new_coordinate_permutations,
2003 subview_coordinate_permutations);
2006 cut_shift += num_parts - 1;
2007 tlr_shift += (4 *(num_parts - 1) + 1);
2008 output_array_shift += num_parts;
2009 partweight_array_shift += (2 * (num_parts - 1) + 1);
2018 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
2019 mj_part_t num_parts =
2020 host_num_partitioning_in_current_dim(current_work_part + kk);
2021 auto local_new_part_xadj = this->new_part_xadj;
2022 auto local_mj_current_dim_coords = mj_current_dim_coords;
2023 auto local_new_coordinate_permutations =
2024 new_coordinate_permutations;
2025 Kokkos::parallel_for(
2026 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t> (
2027 0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
2029 local_new_part_xadj(output_part_index+ii) +=
2030 output_coordinate_end_index;
2033 mj_lno_t coordinate_end =
2034 local_new_part_xadj(output_part_index+ii);
2035 mj_lno_t coordinate_begin =
2036 local_new_part_xadj(output_part_index);
2038 for(mj_lno_t task_traverse = coordinate_begin;
2039 task_traverse < coordinate_end; ++task_traverse) {
2040 mj_lno_t l = local_new_coordinate_permutations(task_traverse);
2042 local_mj_current_dim_coords(l) = -local_mj_current_dim_coords(l);
2048 mj_part_t get_single;
2049 Kokkos::parallel_reduce(
"Read new_part_xadj",
2050 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
2051 KOKKOS_LAMBDA(
int dummy, mj_part_t & set_single) {
2052 set_single = local_new_part_xadj(output_part_index + num_parts - 1);
2055 output_coordinate_end_index = get_single;
2057 output_part_index += num_parts;
2064 current_num_parts = output_part_count_in_dimension;
2067 Kokkos::View<mj_lno_t *, device_t> tmp = this->coordinate_permutations;
2068 this->coordinate_permutations = this->new_coordinate_permutations;
2069 this->new_coordinate_permutations = tmp;
2071 this->part_xadj = this->new_part_xadj;
2072 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
2073 Kokkos::deep_copy(host_part_xadj, part_xadj);
2074 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
"empty", 0);
2077 Kokkos::deep_copy(initial_adjList_output_adjlist, coordinate_permutations);
2081 for(
size_t i = 0; i < this->num_global_parts ; ++i) {
2082 output_xadj[i+1] = host_part_xadj(i);
2085 delete future_num_part_in_parts;
2086 delete next_future_num_parts_in_parts;
2092 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2093 typename mj_part_t,
typename mj_node_t>
2095 <mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,mj_node_t>::mj_partBox_t>
2099 return this->global_box;
2104 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2105 typename mj_part_t,
typename mj_node_t>
2106 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2107 mj_node_t>::set_to_keep_part_boxes()
2109 this->mj_keep_part_boxes =
true;
2119 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2120 typename mj_part_t,
typename mj_node_t>
2124 this->total_num_cut = 0;
2125 this->total_num_part = 1;
2126 this->max_num_part_along_dim = 0;
2127 this->total_dim_num_reduce_all = 0;
2128 this->last_dim_num_part = 1;
2131 this->max_num_cut_along_dim = 0;
2132 this->max_num_total_part_along_dim = 0;
2134 if(this->part_no_array.size()) {
2135 auto local_recursion_depth = this->recursion_depth;
2137 this->total_dim_num_reduce_all =
2138 this->total_num_part * this->recursion_depth;
2140 this->total_num_part = 1;
2141 for(
int i = 0; i < local_recursion_depth; ++i) {
2142 this->total_num_part *= this->part_no_array(i);
2145 mj_part_t track_max = 0;
2146 for(
int i = 0; i < local_recursion_depth; ++i) {
2147 if(part_no_array(i) > track_max) {
2148 track_max = this->part_no_array(i);
2152 this->last_dim_num_part = this->total_num_part /
2153 this->part_no_array(local_recursion_depth-1);
2155 this->max_num_part_along_dim = track_max;
2156 this->num_global_parts = this->total_num_part;
2158 mj_part_t future_num_parts = this->num_global_parts;
2162 if (this->first_level_distribution.size() != 0 &&
2163 this->num_first_level_parts > 1) {
2164 this->max_num_part_along_dim = this->num_first_level_parts;
2169 for(
int rd = 0; rd < this->recursion_depth; ++rd) {
2170 mj_part_t maxNoPartAlongI = 0;
2171 mj_part_t nfutureNumParts = 0;
2177 this->first_level_distribution.size() != 0 &&
2178 this->num_first_level_parts > 1) {
2180 maxNoPartAlongI = this->num_first_level_parts;
2181 this->max_num_part_along_dim = this->num_first_level_parts;
2183 mj_part_t sum_first_level_dist = 0;
2184 mj_part_t max_part = 0;
2187 for (
int i = 0; i < this->num_first_level_parts; ++i) {
2188 sum_first_level_dist += this->first_level_distribution(i);
2189 if (this->first_level_distribution(i) > max_part)
2190 max_part = this->first_level_distribution(i);
2196 this->num_global_parts * max_part / sum_first_level_dist;
2200 maxNoPartAlongI = this->get_part_count(future_num_parts,
2201 1.0f / (this->recursion_depth - rd));
2202 if (maxNoPartAlongI > this->max_num_part_along_dim)
2203 this->max_num_part_along_dim = maxNoPartAlongI;
2204 nfutureNumParts = future_num_parts / maxNoPartAlongI;
2205 if (future_num_parts % maxNoPartAlongI) {
2209 future_num_parts = nfutureNumParts;
2211 this->total_num_part = this->num_global_parts;
2213 if(this->divide_to_prime_first) {
2214 this->total_dim_num_reduce_all = this->num_global_parts * 2;
2215 this->last_dim_num_part = this->num_global_parts;
2222 for(
int i = 0; i < this->recursion_depth; ++i) {
2223 this->total_dim_num_reduce_all += p;
2224 p *= this->max_num_part_along_dim;
2227 if(p / this->max_num_part_along_dim > this->num_global_parts) {
2228 this->last_dim_num_part = this->num_global_parts;
2231 this->last_dim_num_part = p / this->max_num_part_along_dim;
2236 this->total_num_cut = this->total_num_part - 1;
2237 this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
2238 this->max_num_total_part_along_dim = this->max_num_part_along_dim +
2239 size_t(this->max_num_cut_along_dim);
2244 if(this->max_concurrent_part_calculation > this->last_dim_num_part) {
2245 if(this->mj_problemComm->getRank() == 0) {
2246 std::cerr <<
"Warning: Concurrent part count (" <<
2247 this->max_concurrent_part_calculation <<
2248 ") has been set bigger than maximum amount that can be used." <<
2249 " Setting to:" << this->last_dim_num_part <<
"." << std::endl;
2251 this->max_concurrent_part_calculation = this->last_dim_num_part;
2260 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2261 typename mj_part_t,
typename mj_node_t>
2262 inline mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2263 get_part_count(mj_part_t num_total_future,
double root)
2265 double fp = pow(num_total_future, root);
2266 mj_part_t ip = mj_part_t(fp);
2294 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2295 typename mj_part_t,
typename mj_node_t>
2296 mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2297 update_part_num_arrays(
2298 std::vector<mj_part_t> *future_num_part_in_parts,
2299 std::vector<mj_part_t> *next_future_num_parts_in_parts,
2300 mj_part_t &future_num_parts,
2301 mj_part_t current_num_parts,
2302 int current_iteration,
2303 RCP<mj_partBoxVector_t> input_part_boxes,
2304 RCP<mj_partBoxVector_t> output_part_boxes,
2305 mj_part_t atomic_part_count)
2307 std::vector<mj_part_t> num_partitioning_in_current_dim;
2310 mj_part_t output_num_parts = 0;
2311 if(this->part_no_array.size()) {
2315 mj_part_t current_part_no_array =
2316 this->part_no_array(current_iteration);
2318 if(current_part_no_array < 1) {
2319 std::cout <<
"Current recursive iteration: " << current_iteration <<
2320 " part_no_array[" << current_iteration <<
"] is given as:" <<
2321 current_part_no_array << std::endl;
2324 if(current_part_no_array == 1) {
2325 return current_num_parts;
2329 if (this->first_level_distribution.size() != 0 &&
2330 current_iteration == 0 &&
2331 current_part_no_array != this->num_first_level_parts) {
2332 std::cout <<
"Current recursive iteration: " << current_iteration
2333 <<
" part_no_array[" << current_iteration <<
"] is given as: " <<
2334 current_part_no_array <<
" and contradicts num_first_level_parts: " <<
2335 this->num_first_level_parts << std::endl;
2339 for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
2340 num_partitioning_in_current_dim.push_back(current_part_no_array);
2357 future_num_parts /= num_partitioning_in_current_dim[0];
2358 output_num_parts = current_num_parts *
2359 num_partitioning_in_current_dim[0];
2360 if(this->mj_keep_part_boxes) {
2361 for(mj_part_t k = 0; k < current_num_parts; ++k) {
2363 for(mj_part_t j = 0; j <
2364 num_partitioning_in_current_dim[0]; ++j) {
2365 output_part_boxes->push_back((*input_part_boxes)[k]);
2373 for(mj_part_t ii = 0; ii < output_num_parts; ++ii) {
2374 next_future_num_parts_in_parts->push_back(future_num_parts);
2384 future_num_parts = 1;
2387 for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
2389 mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];
2393 mj_part_t num_partitions_in_current_dim =
2394 this->get_part_count(future_num_parts_of_part_ii,
2395 1.0 / (this->recursion_depth - current_iteration)
2397 if(num_partitions_in_current_dim > this->max_num_part_along_dim) {
2398 std::cerr <<
"ERROR: maxPartNo calculation is wrong."
2399 " num_partitions_in_current_dim: "
2400 << num_partitions_in_current_dim <<
" this->max_num_part_along_dim: "
2401 << this->max_num_part_along_dim <<
2402 " this->recursion_depth: " << this->recursion_depth <<
2403 " current_iteration:" << current_iteration <<
2404 " future_num_parts_of_part_ii: " << future_num_parts_of_part_ii <<
2405 " might need to fix max part no calculation for "
2406 "largest_prime_first partitioning." <<
2418 if (current_iteration == 0 &&
2419 this->first_level_distribution.size() != 0 &&
2420 this->num_first_level_parts > 1) {
2423 num_partitioning_in_current_dim.push_back(this->num_first_level_parts);
2426 output_num_parts = this->num_first_level_parts;
2429 future_num_parts /= this->num_first_level_parts;
2431 mj_part_t max_part = 0;
2432 mj_part_t sum_first_level_dist = 0;
2436 for (
int i = 0; i < this->num_first_level_parts; ++i) {
2437 sum_first_level_dist += this->first_level_distribution(i);
2439 if (this->first_level_distribution(i) > max_part)
2440 max_part = this->first_level_distribution(i);
2444 future_num_parts = this->num_global_parts * max_part / sum_first_level_dist;
2448 for (
int i = 0; i < this->num_first_level_parts; ++i) {
2449 next_future_num_parts_in_parts->push_back(this->first_level_distribution(i) *
2450 this->num_global_parts / sum_first_level_dist);
2453 else if (this->divide_to_prime_first) {
2455 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2457 mj_part_t largest_prime_factor = num_partitions_in_current_dim;
2460 output_num_parts += num_partitions_in_current_dim;
2462 if (future_num_parts_of_part_ii == atomic_part_count ||
2463 future_num_parts_of_part_ii % atomic_part_count != 0) {
2464 atomic_part_count = 1;
2467 largest_prime_factor =
2468 this->find_largest_prime_factor(future_num_parts_of_part_ii / atomic_part_count);
2475 if (largest_prime_factor < num_partitions_in_current_dim) {
2476 largest_prime_factor = num_partitions_in_current_dim;
2479 mj_part_t ideal_num_future_parts_in_part =
2480 (future_num_parts_of_part_ii / atomic_part_count) / largest_prime_factor;
2482 mj_part_t ideal_prime_scale = largest_prime_factor / num_partitions_in_current_dim;
2490 for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2492 mj_part_t my_ideal_primescale = ideal_prime_scale;
2494 if (iii < (largest_prime_factor) % num_partitions_in_current_dim) {
2495 ++my_ideal_primescale;
2498 mj_part_t num_future_parts_for_part_iii =
2499 ideal_num_future_parts_in_part * my_ideal_primescale;
2502 if (iii < (future_num_parts_of_part_ii / atomic_part_count) % largest_prime_factor) {
2504 ++num_future_parts_for_part_iii;
2507 next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);
2510 if (this->mj_keep_part_boxes) {
2511 output_part_boxes->push_back((*input_part_boxes)[ii]);
2515 if (num_future_parts_for_part_iii > future_num_parts)
2516 future_num_parts = num_future_parts_for_part_iii;
2522 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2525 output_num_parts += num_partitions_in_current_dim;
2527 if((future_num_parts_of_part_ii == atomic_part_count) ||
2528 (future_num_parts_of_part_ii % atomic_part_count != 0)) {
2529 atomic_part_count = 1;
2532 mj_part_t ideal_num_future_parts_in_part =
2533 (future_num_parts_of_part_ii / atomic_part_count) /
2534 num_partitions_in_current_dim;
2535 for(mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2536 mj_part_t num_future_parts_for_part_iii =
2537 ideal_num_future_parts_in_part;
2540 if(iii < (future_num_parts_of_part_ii / atomic_part_count) %
2541 num_partitions_in_current_dim) {
2543 ++num_future_parts_for_part_iii;
2546 next_future_num_parts_in_parts->push_back(
2547 num_future_parts_for_part_iii * atomic_part_count);
2551 if(this->mj_keep_part_boxes) {
2552 output_part_boxes->push_back((*input_part_boxes)[ii]);
2555 if(num_future_parts_for_part_iii > future_num_parts)
2556 future_num_parts = num_future_parts_for_part_iii;
2562 device_num_partitioning_in_current_dim = Kokkos::View<
2563 mj_part_t*,
device_t>(
"test", num_partitioning_in_current_dim.size());
2564 host_num_partitioning_in_current_dim =
2565 Kokkos::create_mirror_view(device_num_partitioning_in_current_dim);
2566 for(
size_t n = 0; n < num_partitioning_in_current_dim.size(); ++n) {
2567 host_num_partitioning_in_current_dim(n) =
2568 num_partitioning_in_current_dim[n];
2573 Kokkos::deep_copy(device_num_partitioning_in_current_dim,
2574 host_num_partitioning_in_current_dim);
2575 return output_num_parts;
2580 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2581 typename mj_part_t,
typename mj_node_t>
2582 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2583 allocate_set_work_memory()
2588 this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2589 Kokkos::ViewAllocateWithoutInitializing(
"coordinate_permutations"),
2590 this->num_local_coords);
2591 auto local_coordinate_permutations = coordinate_permutations;
2592 Kokkos::parallel_for(
2593 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
2594 0, this->num_local_coords), KOKKOS_LAMBDA (mj_lno_t i) {
2595 local_coordinate_permutations(i) = i;
2599 this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2600 Kokkos::ViewAllocateWithoutInitializing(
"num_local_coords"),
2601 this->num_local_coords);
2603 this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2604 Kokkos::ViewAllocateWithoutInitializing(
"assigned parts"), 0);
2605 if(this->num_local_coords > 0) {
2606 this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2607 Kokkos::ViewAllocateWithoutInitializing(
"assigned part ids"),
2608 this->num_local_coords);
2615 this->part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2616 Kokkos::ViewAllocateWithoutInitializing(
"part xadj"), 1);
2617 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
2618 host_part_xadj(0) = num_local_coords;
2619 Kokkos::deep_copy(this->part_xadj, host_part_xadj);
2622 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2623 Kokkos::ViewAllocateWithoutInitializing(
"empty"), 0);
2626 this->all_cut_coordinates = Kokkos::View<mj_scalar_t*, device_t>(
2627 Kokkos::ViewAllocateWithoutInitializing(
"all cut coordinates"),
2628 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2631 this->process_cut_line_weight_to_put_left = Kokkos::View<mj_scalar_t*,
2632 device_t>(Kokkos::ViewAllocateWithoutInitializing(
"empty"), 0);
2636 this->thread_cut_line_weight_to_put_left =
2637 Kokkos::View<mj_scalar_t*, device_t>(
2638 Kokkos::ViewAllocateWithoutInitializing(
"empty"), 0);
2640 if(this->distribute_points_on_cut_lines) {
2641 this->process_cut_line_weight_to_put_left =
2642 Kokkos::View<mj_scalar_t *, device_t>(
2643 Kokkos::ViewAllocateWithoutInitializing(
2644 "process_cut_line_weight_to_put_left"),
2645 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2646 this->thread_cut_line_weight_to_put_left =
2647 Kokkos::View<mj_scalar_t *, device_t>(
2648 Kokkos::ViewAllocateWithoutInitializing(
2649 "thread_cut_line_weight_to_put_left"),
2650 this->max_num_cut_along_dim);
2651 this->process_rectilinear_cut_weight =
2652 Kokkos::View<mj_scalar_t *, device_t>(
2653 Kokkos::ViewAllocateWithoutInitializing(
"process_rectilinear_cut_weight"),
2654 this->max_num_cut_along_dim);
2655 this->global_rectilinear_cut_weight =
2656 Kokkos::View<mj_scalar_t *, device_t>(
2657 Kokkos::ViewAllocateWithoutInitializing(
"global_rectilinear_cut_weight"),
2658 this->max_num_cut_along_dim);
2665 this->cut_coordinates_work_array =
2666 Kokkos::View<mj_scalar_t *, device_t>(
2667 Kokkos::ViewAllocateWithoutInitializing(
"cut_coordinates_work_array"),
2668 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2671 this->target_part_weights = Kokkos::View<mj_scalar_t*, device_t>(
2672 Kokkos::ViewAllocateWithoutInitializing(
"target_part_weights"),
2673 this->max_num_part_along_dim * this->max_concurrent_part_calculation);
2676 this->cut_upper_bound_coordinates =
2677 Kokkos::View<mj_scalar_t*, device_t>(
2678 Kokkos::ViewAllocateWithoutInitializing(
"cut_upper_bound_coordinates"),
2679 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2682 this->cut_lower_bound_coordinates =
2683 Kokkos::View<mj_scalar_t*, device_t>(
2684 Kokkos::ViewAllocateWithoutInitializing(
"cut_lower_bound_coordinates"),
2685 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2688 this->cut_lower_bound_weights =
2689 Kokkos::View<mj_scalar_t*, device_t>(
2690 Kokkos::ViewAllocateWithoutInitializing(
"cut_lower_bound_weights"),
2691 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2694 this->cut_upper_bound_weights =
2695 Kokkos::View<mj_scalar_t*, device_t>(
2696 Kokkos::ViewAllocateWithoutInitializing(
"cut_upper_bound_weights"),
2697 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2701 this->process_local_min_max_coord_total_weight =
2702 Kokkos::View<mj_scalar_t*, device_t>(
2703 Kokkos::ViewAllocateWithoutInitializing(
2704 "process_local_min_max_coord_total_weight"),
2705 3 * this->max_concurrent_part_calculation);
2708 this->global_min_max_coord_total_weight =
2709 Kokkos::View<mj_scalar_t*, device_t>(
2710 Kokkos::ViewAllocateWithoutInitializing(
"global_min_max_coord_total_weight"),
2711 3 * this->max_concurrent_part_calculation);
2716 this->is_cut_line_determined = Kokkos::View<bool *, device_t>(
2717 Kokkos::ViewAllocateWithoutInitializing(
"is_cut_line_determined"),
2718 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2724 this->device_incomplete_cut_count = Kokkos::View<mj_part_t *, device_t>(
2725 Kokkos::ViewAllocateWithoutInitializing(
"device_incomplete_cut_count"),
2726 this->max_concurrent_part_calculation);
2727 this->incomplete_cut_count =
2728 Kokkos::create_mirror_view(device_incomplete_cut_count);
2731 this->thread_part_weights = Kokkos::View<double *, device_t>(
2732 Kokkos::ViewAllocateWithoutInitializing(
"thread_part_weights"),
2733 this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
2735 this->thread_cut_left_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2736 Kokkos::ViewAllocateWithoutInitializing(
"thread_cut_left_closest_point"),
2737 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2741 this->thread_cut_right_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2742 Kokkos::ViewAllocateWithoutInitializing(
"thread_cut_right_closest_point"),
2743 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2746 this->thread_point_counts = Kokkos::View<mj_lno_t *, device_t>(
2747 Kokkos::ViewAllocateWithoutInitializing(
"thread_point_counts"),
2748 this->max_num_part_along_dim);
2754 this->total_part_weight_left_right_closests =
2755 Kokkos::View<mj_scalar_t*, device_t>(
2756 Kokkos::ViewAllocateWithoutInitializing(
2757 "total_part_weight_left_right_closests"),
2758 (this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) *
2759 this->max_concurrent_part_calculation);
2761 this->global_total_part_weight_left_right_closests =
2762 Kokkos::View<mj_scalar_t*, device_t>(
2763 Kokkos::ViewAllocateWithoutInitializing(
2764 "global_total_part_weight_left_right_closests"),
2765 (this->max_num_total_part_along_dim +
2766 this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2768 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
2769 Kokkos::ViewAllocateWithoutInitializing(
"gids"), num_local_coords);
2771 this->owner_of_coordinate = Kokkos::View<int *, Kokkos::HostSpace>(
2772 Kokkos::ViewAllocateWithoutInitializing(
"owner_of_coordinate"),
2778 Kokkos::deep_copy(owner_of_coordinate, myActualRank);
2780 auto local_current_mj_gnos = current_mj_gnos;
2781 auto local_initial_mj_gnos = initial_mj_gnos;
2782 Kokkos::parallel_for(
2783 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2784 (0, num_local_coords), KOKKOS_LAMBDA (mj_lno_t j) {
2785 local_current_mj_gnos(j) = local_initial_mj_gnos(j);
2791 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2792 typename mj_part_t,
typename mj_node_t>
2793 void AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,
2794 mj_node_t>::compute_global_box()
2797 mj_scalar_t *mins =
new mj_scalar_t[this->coord_dim];
2799 mj_scalar_t *gmins =
new mj_scalar_t[this->coord_dim];
2801 mj_scalar_t *maxs =
new mj_scalar_t[this->coord_dim];
2803 mj_scalar_t *gmaxs =
new mj_scalar_t[this->coord_dim];
2805 auto local_mj_coordinates = this->mj_coordinates;
2809 for(
int i = 0; i < this->coord_dim; ++i) {
2814 for(
int i = 0; i < std::min(this->recursion_depth, this->coord_dim); ++i) {
2815 Kokkos::parallel_reduce(
"MinReduce",
2816 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2817 (0, this->num_local_coords),
2818 KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_min) {
2819 if(local_mj_coordinates(j,i) < running_min) {
2820 running_min = local_mj_coordinates(j,i);
2822 }, Kokkos::Min<mj_scalar_t>(mins[i]));
2823 Kokkos::parallel_reduce(
"MaxReduce",
2824 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2825 (0, this->num_local_coords),
2826 KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_max) {
2827 if(local_mj_coordinates(j,i) > running_max) {
2828 running_max = local_mj_coordinates(j,i);
2830 }, Kokkos::Max<mj_scalar_t>(maxs[i]));
2833 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
2834 this->coord_dim, mins, gmins
2837 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
2838 this->coord_dim, maxs, gmaxs
2842 global_box = rcp(
new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
2856 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2857 typename mj_part_t,
typename mj_node_t>
2858 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2859 mj_node_t>::init_part_boxes(
2860 RCP<mj_partBoxVector_t> & initial_partitioning_boxes)
2862 mj_partBox_t tmp_box(*global_box);
2863 initial_partitioning_boxes->push_back(tmp_box);
2870 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2873 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2874 mj_get_local_min_max_coord_totW(
2875 mj_part_t current_work_part,
2876 mj_part_t current_concurrent_num_parts,
2877 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords)
2879 auto local_coordinate_permutations = this->coordinate_permutations;
2880 auto local_process_local_min_max_coord_total_weight =
2881 this->process_local_min_max_coord_total_weight;
2882 auto local_mj_weights = this->mj_weights;
2884 bool bUniformWeights = mj_uniform_weights(0);
2886 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
2888 mj_part_t concurrent_current_part = current_work_part + kk;
2889 mj_lno_t coordinate_begin_index = concurrent_current_part == 0 ? 0 :
2890 host_part_xadj(concurrent_current_part - 1);
2891 mj_lno_t coordinate_end_index =
2892 host_part_xadj(concurrent_current_part);
2894 mj_scalar_t my_min_coord = 0;
2895 mj_scalar_t my_max_coord = 0;
2896 mj_scalar_t my_total_weight;
2899 if(coordinate_begin_index >= coordinate_end_index)
2901 my_min_coord = std::numeric_limits<mj_scalar_t>::max();
2902 my_max_coord = -std::numeric_limits<mj_scalar_t>::max();
2903 my_total_weight = 0;
2907 Kokkos::parallel_reduce(
"get min",
2908 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2909 (coordinate_begin_index, coordinate_end_index),
2910 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_min) {
2911 int i = local_coordinate_permutations(j);
2912 if(mj_current_dim_coords(i) < running_min)
2913 running_min = mj_current_dim_coords(i);
2914 }, Kokkos::Min<mj_scalar_t>(my_min_coord));
2916 Kokkos::parallel_reduce(
"get max",
2917 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2918 (coordinate_begin_index, coordinate_end_index),
2919 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_max) {
2920 int i = local_coordinate_permutations(j);
2921 if(mj_current_dim_coords(i) > running_max)
2922 running_max = mj_current_dim_coords(i);
2923 }, Kokkos::Max<mj_scalar_t>(my_max_coord));
2924 if(bUniformWeights) {
2925 my_total_weight = coordinate_end_index - coordinate_begin_index;
2928 my_total_weight = 0;
2929 Kokkos::parallel_reduce(
"get weight",
2930 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2931 (coordinate_begin_index, coordinate_end_index),
2932 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & lsum) {
2933 int i = local_coordinate_permutations(j);
2934 lsum += local_mj_weights(i,0);
2935 }, my_total_weight);
2940 Kokkos::parallel_for(
2941 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
2942 (0, 1), KOKKOS_LAMBDA (
int dummy) {
2943 local_process_local_min_max_coord_total_weight(kk) =
2945 local_process_local_min_max_coord_total_weight(
2946 kk + current_concurrent_num_parts) = my_max_coord;
2947 local_process_local_min_max_coord_total_weight(
2948 kk + 2*current_concurrent_num_parts) = my_total_weight;
2965 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2966 typename mj_part_t,
typename mj_node_t>
2967 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2968 mj_node_t>::mj_get_global_min_max_coord_totW(
2969 mj_part_t current_concurrent_num_parts,
2970 Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
2971 Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total) {
2975 if(this->comm->getSize() > 1) {
2978 auto host_local_min_max_total =
2979 Kokkos::create_mirror_view(Kokkos::HostSpace(), local_min_max_total);
2980 auto host_global_min_max_total =
2981 Kokkos::create_mirror_view(Kokkos::HostSpace(), global_min_max_total);
2982 Kokkos::deep_copy(host_local_min_max_total, local_min_max_total);
2984 reductionOp(current_concurrent_num_parts,
2985 current_concurrent_num_parts, current_concurrent_num_parts);
2987 reduceAll<int, mj_scalar_t>(
2990 3 * current_concurrent_num_parts,
2991 host_local_min_max_total.data(),
2992 host_global_min_max_total.data());
2995 Kokkos::deep_copy(global_min_max_total, host_global_min_max_total);
2998 mj_part_t s = 3 * current_concurrent_num_parts;
2999 Kokkos::parallel_for(
3000 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3001 (0, s), KOKKOS_LAMBDA (mj_part_t i) {
3002 global_min_max_total(i) = local_min_max_total(i);
3039 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3040 typename mj_part_t,
typename mj_node_t>
3041 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
3042 mj_get_initial_cut_coords_target_weights(
3043 mj_scalar_t min_coord,
3044 mj_scalar_t max_coord,
3045 mj_part_t num_cuts ,
3046 mj_scalar_t global_weight,
3048 Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
3050 Kokkos::View<mj_scalar_t *, device_t> & current_target_part_weights ,
3051 std::vector <mj_part_t> *future_num_part_in_parts,
3052 std::vector <mj_part_t> *next_future_num_parts_in_parts,
3053 mj_part_t concurrent_current_part,
3054 mj_part_t obtained_part_index,
3055 mj_part_t num_target_first_level_parts,
3056 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist)
3058 mj_scalar_t coord_range = max_coord - min_coord;
3065 bool bUniformPartsCheck =
3066 num_target_first_level_parts <= 1 && this->mj_uniform_parts(0);
3068 if(!bUniformPartsCheck) {
3069 bool bValidNonUniformTargetWeights =
3070 (num_target_first_level_parts > 1 && target_first_level_dist.size() != 0);
3071 if(!bValidNonUniformTargetWeights) {
3072 std::cerr <<
"MJ does not support non uniform part weights beyond the first partition" << std::endl;
3077 Kokkos::View<mj_scalar_t*, device_t> device_cumulative(
3078 "device_cumulative", num_cuts);
3079 auto host_cumulative = Kokkos::create_mirror_view(device_cumulative);
3081 mj_scalar_t cumulative = 0;
3083 if(bUniformPartsCheck) {
3085 mj_scalar_t total_future_part_count_in_part =
3086 static_cast<mj_scalar_t
>((*future_num_part_in_parts)[concurrent_current_part]);
3089 mj_scalar_t unit_part_weight =
3090 global_weight / total_future_part_count_in_part;
3092 for(mj_part_t i = 0; i < num_cuts; ++i) {
3093 cumulative += unit_part_weight *
static_cast<mj_scalar_t
>((*next_future_num_parts_in_parts)[i + obtained_part_index]);
3094 host_cumulative(i) = cumulative;
3099 mj_scalar_t sum_target_first_level_dist = 0.0;
3100 for (
int i = 0; i < num_target_first_level_parts; ++i) {
3101 sum_target_first_level_dist += target_first_level_dist(i);
3104 for(mj_part_t i = 0; i < num_cuts; ++i) {
3105 cumulative += global_weight * target_first_level_dist(i) /
3106 sum_target_first_level_dist;
3107 host_cumulative(i) = cumulative;
3111 Kokkos::deep_copy(device_cumulative, host_cumulative);
3113 Kokkos::parallel_for(
"Write num in parts",
3114 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3115 (0, num_cuts), KOKKOS_LAMBDA(mj_part_t cut) {
3117 current_target_part_weights(cut) = device_cumulative(cut);
3118 initial_cut_coords(cut) = min_coord +
3119 (coord_range * device_cumulative(cut)) / global_weight;
3121 current_target_part_weights(num_cuts) = global_weight;
3127 if (!bUniformPartsCheck || this->mj_uniform_weights[0]) {
3128 Kokkos::parallel_for(
3129 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3131 KOKKOS_LAMBDA (mj_part_t i) {
3132 current_target_part_weights(i) =
3133 long(current_target_part_weights(i) + 0.5);
3154 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3155 typename mj_part_t,
typename mj_node_t>
3156 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
3157 set_initial_coordinate_parts(
3158 mj_scalar_t &max_coordinate,
3159 mj_scalar_t &min_coordinate,
3160 mj_lno_t coordinate_begin_index,
3161 mj_lno_t coordinate_end_index,
3162 Kokkos::View<mj_lno_t *, device_t> & mj_current_coordinate_permutations,
3163 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3164 Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
3165 mj_part_t &partition_count)
3167 mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
3171 if(std::abs(coordinate_range) < this->sEpsilon ) {
3172 Kokkos::parallel_for(
3173 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3174 (coordinate_begin_index, coordinate_end_index),
3175 KOKKOS_LAMBDA (mj_lno_t ii) {
3176 mj_part_ids(mj_current_coordinate_permutations[ii]) = 0;
3182 mj_scalar_t slice = coordinate_range / partition_count;
3183 Kokkos::parallel_for(
3184 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3185 (coordinate_begin_index, coordinate_end_index),
3186 KOKKOS_LAMBDA (mj_lno_t ii) {
3187 mj_lno_t iii = mj_current_coordinate_permutations[ii];
3189 mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
3190 if(pp >= partition_count) {
3191 pp = partition_count - 1;
3193 mj_part_ids[iii] = 2 * pp;
3212 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3213 typename mj_part_t,
typename mj_node_t>
3214 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,mj_node_t>::mj_1D_part(
3215 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3216 double used_imbalance_tolerance,
3217 mj_part_t current_work_part,
3218 mj_part_t current_concurrent_num_parts,
3219 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
3220 mj_part_t total_incomplete_cut_count,
3221 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
3222 Kokkos::View<size_t*, device_t> & view_total_reduction_size)
3224 this->temp_cut_coords = current_cut_coordinates;
3227 *reductionOp = NULL;
3229 bool bSingleProcess = (this->comm->getSize() == 1);
3231 std::vector<mj_part_t> temp(host_num_partitioning_in_current_dim.size());
3232 if(!bSingleProcess) {
3233 for(
size_t n = 0; n < host_num_partitioning_in_current_dim.size(); ++n) {
3234 temp[n] = host_num_partitioning_in_current_dim(n);
3237 <mj_part_t, mj_scalar_t>(
3240 current_concurrent_num_parts);
3243 auto local_cut_lower_bound_coordinates =
3244 cut_lower_bound_coordinates;
3245 auto local_cut_upper_bound_coordinates =
3246 cut_upper_bound_coordinates;
3247 auto local_cut_upper_bound_weights = cut_upper_bound_weights;
3248 auto local_cut_lower_bound_weights = cut_lower_bound_weights;
3249 bool local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
3250 auto local_process_cut_line_weight_to_put_left =
3251 process_cut_line_weight_to_put_left;
3252 auto local_temp_cut_coords = temp_cut_coords;
3253 auto local_global_total_part_weight_left_right_closests =
3254 global_total_part_weight_left_right_closests;
3255 auto local_cut_coordinates_work_array =
3256 cut_coordinates_work_array;
3257 auto local_part_xadj = part_xadj;
3258 auto local_global_min_max_coord_total_weight =
3259 global_min_max_coord_total_weight;
3260 auto local_target_part_weights =
3261 target_part_weights;
3262 auto local_global_rectilinear_cut_weight =
3263 global_rectilinear_cut_weight;
3264 auto local_process_rectilinear_cut_weight =
3265 process_rectilinear_cut_weight;
3267 auto local_is_cut_line_determined = this->is_cut_line_determined;
3268 auto local_device_num_partitioning_in_current_dim =
3269 device_num_partitioning_in_current_dim;
3271 Kokkos::parallel_for(
3272 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3273 KOKKOS_LAMBDA (
int dummy) {
3276 view_rectilinear_cut_count(0) = 0;
3277 view_total_reduction_size(0) = 0;
3281 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3282 mj_part_t num_part_in_dim =
3283 local_device_num_partitioning_in_current_dim(current_work_part + i);
3284 mj_part_t num_cut_in_dim = num_part_in_dim - 1;
3285 view_total_reduction_size(0) += (4 * num_cut_in_dim + 1);
3287 for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii) {
3288 local_is_cut_line_determined(next) =
false;
3290 local_cut_lower_bound_coordinates(next) =
3291 local_global_min_max_coord_total_weight(i);
3293 local_cut_upper_bound_coordinates(next) =
3294 local_global_min_max_coord_total_weight(
3295 i + current_concurrent_num_parts);
3297 local_cut_upper_bound_weights(next) =
3298 local_global_min_max_coord_total_weight(
3299 i + 2 * current_concurrent_num_parts);
3300 local_cut_lower_bound_weights(next) = 0;
3301 if(local_distribute_points_on_cut_lines) {
3302 local_process_cut_line_weight_to_put_left(next) = 0;
3313 while (total_incomplete_cut_count != 0) {
3314 this->mj_1D_part_get_part_weights(
3315 current_concurrent_num_parts,
3317 mj_current_dim_coords,
3321 this->mj_combine_rightleft_and_weights(
3323 current_concurrent_num_parts);
3326 if(!bSingleProcess) {
3329 auto host_total_part_weight_left_right_closests =
3330 Kokkos::create_mirror_view(Kokkos::HostSpace(),
3331 total_part_weight_left_right_closests);
3332 auto host_global_total_part_weight_left_right_closests =
3333 Kokkos::create_mirror_view(Kokkos::HostSpace(),
3334 global_total_part_weight_left_right_closests);
3336 Kokkos::deep_copy(host_total_part_weight_left_right_closests,
3337 total_part_weight_left_right_closests);
3339 size_t host_view_total_reduction_size;
3340 Kokkos::parallel_reduce(
"Read single",
3341 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3342 KOKKOS_LAMBDA(
int dummy,
size_t & set_single) {
3343 set_single = view_total_reduction_size(0);
3344 }, host_view_total_reduction_size);
3346 reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
3347 host_view_total_reduction_size,
3348 host_total_part_weight_left_right_closests.data(),
3349 host_global_total_part_weight_left_right_closests.data());
3350 Kokkos::deep_copy(global_total_part_weight_left_right_closests,
3351 host_global_total_part_weight_left_right_closests);
3354 local_global_total_part_weight_left_right_closests =
3355 this->total_part_weight_left_right_closests;
3360 mj_part_t cut_shift = 0;
3364 size_t tlr_shift = 0;
3366 Kokkos::View<mj_part_t*, Kokkos::HostSpace>
3367 save_initial_incomplete_cut_count(
"save_initial_incomplete_cut_count",
3368 current_concurrent_num_parts);
3370 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3372 mj_part_t num_parts =
3373 host_num_partitioning_in_current_dim(current_work_part + kk);
3375 mj_part_t num_cuts = num_parts - 1;
3376 size_t num_total_part = num_parts + size_t (num_cuts);
3381 mj_part_t kk_incomplete_cut_count = this->incomplete_cut_count(kk);
3383 if(kk_incomplete_cut_count == 0) {
3384 cut_shift += num_cuts;
3385 tlr_shift += (num_total_part + 2 * num_cuts);
3389 Kokkos::View<mj_scalar_t *, device_t> current_local_part_weights =
3390 Kokkos::subview(this->total_part_weight_left_right_closests,
3391 std::pair<mj_lno_t, mj_lno_t>(
3393 this->total_part_weight_left_right_closests.size()));
3395 Kokkos::View<mj_scalar_t *, device_t> current_global_tlr =
3397 local_global_total_part_weight_left_right_closests,
3398 std::pair<mj_lno_t, mj_lno_t>(
3400 local_global_total_part_weight_left_right_closests.size()));
3401 Kokkos::View<mj_scalar_t *, device_t>
3402 current_global_left_closest_points =
3403 Kokkos::subview(current_global_tlr,
3404 std::pair<mj_lno_t, mj_lno_t>(
3406 current_global_tlr.size()));
3407 Kokkos::View<mj_scalar_t *, device_t>
3408 current_global_right_closest_points =
3409 Kokkos::subview(current_global_tlr,
3410 std::pair<mj_lno_t, mj_lno_t>(
3411 num_total_part + num_cuts,
3412 current_global_tlr.size()));
3413 Kokkos::View<mj_scalar_t *, device_t> current_global_part_weights =
3416 Kokkos::View<bool *, device_t> current_cut_line_determined =
3417 Kokkos::subview(this->is_cut_line_determined,
3418 std::pair<mj_lno_t, mj_lno_t>(
3420 this->is_cut_line_determined.size()));
3421 Kokkos::View<mj_scalar_t *, device_t> current_part_target_weights =
3422 Kokkos::subview(local_target_part_weights,
3423 std::pair<mj_lno_t, mj_lno_t>(
3425 local_target_part_weights.size()));
3426 Kokkos::View<mj_scalar_t *, device_t>
3427 current_part_cut_line_weight_to_put_left =
3428 Kokkos::subview(local_process_cut_line_weight_to_put_left,
3429 std::pair<mj_lno_t, mj_lno_t>(
3431 local_process_cut_line_weight_to_put_left.size()));
3433 save_initial_incomplete_cut_count(kk) =
3434 kk_incomplete_cut_count;
3436 Kokkos::View<mj_scalar_t *, device_t>
3437 current_cut_lower_bound_weights =
3438 Kokkos::subview(local_cut_lower_bound_weights,
3439 std::pair<mj_lno_t, mj_lno_t>(
3441 local_cut_lower_bound_weights.size()));
3442 Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_weights =
3443 Kokkos::subview(local_cut_upper_bound_weights,
3444 std::pair<mj_lno_t, mj_lno_t>(
3446 local_cut_upper_bound_weights.size()));
3447 Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_bounds =
3448 Kokkos::subview(local_cut_upper_bound_coordinates,
3449 std::pair<mj_lno_t, mj_lno_t>(
3451 local_cut_upper_bound_coordinates.size()));
3452 Kokkos::View<mj_scalar_t *, device_t> current_cut_lower_bounds =
3453 Kokkos::subview(local_cut_lower_bound_coordinates,
3454 std::pair<mj_lno_t, mj_lno_t>(
3456 local_cut_lower_bound_coordinates.size()));
3459 Kokkos::View<mj_scalar_t*, device_t> sub_temp_cut_coords =
3460 Kokkos::subview(this->temp_cut_coords,
3461 std::pair<mj_lno_t, mj_lno_t>(
3462 cut_shift, this->temp_cut_coords.size()));
3463 Kokkos::View<mj_scalar_t*, device_t> sub_cut_coordinates_work_array =
3464 Kokkos::subview(this->cut_coordinates_work_array,
3465 std::pair<mj_lno_t, mj_lno_t>(
3466 cut_shift, this->cut_coordinates_work_array.size()));
3468 this->mj_get_new_cut_coordinates(
3469 current_concurrent_num_parts,
3472 used_imbalance_tolerance,
3473 current_global_part_weights,
3474 current_local_part_weights,
3475 current_part_target_weights,
3476 current_cut_line_determined,
3477 sub_temp_cut_coords,
3478 current_cut_upper_bounds,
3479 current_cut_lower_bounds,
3480 current_global_left_closest_points,
3481 current_global_right_closest_points,
3482 current_cut_lower_bound_weights,
3483 current_cut_upper_weights,
3484 sub_cut_coordinates_work_array,
3485 current_part_cut_line_weight_to_put_left,
3486 view_rectilinear_cut_count);
3488 cut_shift += num_cuts;
3489 tlr_shift += (num_total_part + 2 * num_cuts);
3492 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3493 mj_part_t iteration_complete_cut_count =
3494 save_initial_incomplete_cut_count(kk) - this->incomplete_cut_count(kk);
3495 total_incomplete_cut_count -= iteration_complete_cut_count;
3498 Kokkos::parallel_for(
3499 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3500 (0, local_temp_cut_coords.size()), KOKKOS_LAMBDA(
int n) {
3501 auto t = local_temp_cut_coords(n);
3502 local_temp_cut_coords(n) = local_cut_coordinates_work_array(n);
3503 local_cut_coordinates_work_array(n) = t;
3511 if(current_cut_coordinates != local_temp_cut_coords) {
3512 Kokkos::parallel_for(
3513 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3514 (0, 1), KOKKOS_LAMBDA(
int dummy) {
3516 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3517 mj_part_t num_parts = -1;
3518 num_parts = local_device_num_partitioning_in_current_dim(
3519 current_work_part + i);
3520 mj_part_t num_cuts = num_parts - 1;
3521 for(mj_part_t ii = 0; ii < num_cuts; ++ii) {
3522 current_cut_coordinates(next + ii) = local_temp_cut_coords(next + ii);
3527 static_cast<int>(local_cut_coordinates_work_array.size()); ++n) {
3528 local_cut_coordinates_work_array(n) = local_temp_cut_coords(n);
3536 template<
class scalar_t>
3542 KOKKOS_INLINE_FUNCTION
3545 KOKKOS_INLINE_FUNCTION
3549 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3551 template<
class policy_t,
class scalar_t,
class part_t>
3562 scalar_t mj_max_scalar,
3564 int mj_value_count_rightleft,
3565 int mj_value_count_weights) :
3572 KOKKOS_INLINE_FUNCTION
3577 KOKKOS_INLINE_FUNCTION
3580 dst.
ptr[n] += src.
ptr[n];
3583 for(
int n = value_count_weights + 2;
3585 if(src.
ptr[n] > dst.
ptr[n]) {
3588 if(src.
ptr[n+1] < dst.
ptr[n+1]) {
3589 dst.
ptr[n+1] = src.
ptr[n+1];
3594 KOKKOS_INLINE_FUNCTION
3597 dst.
ptr[n] += src.
ptr[n];
3600 for(
int n = value_count_weights + 2;
3602 if(src.
ptr[n] > dst.
ptr[n]) {
3605 if(src.
ptr[n+1] < dst.
ptr[n+1]) {
3606 dst.
ptr[n+1] = src.
ptr[n+1];
3618 for(
int n = value_count_weights;
3625 #endif // KOKKOS_ENABLE_CUDA && KOKKOS_ENABLE_HIP
3627 template<
class policy_t,
class scalar_t,
class part_t,
class index_t,
3633 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3656 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3657 Kokkos::View<double *, device_t> current_part_weights;
3658 Kokkos::View<scalar_t *, device_t> current_left_closest;
3659 Kokkos::View<scalar_t *, device_t> current_right_closest;
3660 #endif // KOKKOS_ENABLE_CUDA || defined(KOKKOS_ENABLE_HIP)
3664 array_t mj_max_scalar,
3665 part_t mj_concurrent_current_part,
3667 part_t mj_current_work_part,
3668 part_t mj_current_concurrent_num_parts,
3669 part_t mj_left_right_array_size,
3670 part_t mj_weight_array_size,
3671 Kokkos::View<index_t*, device_t> & mj_permutations,
3672 Kokkos::View<scalar_t *, device_t> & mj_coordinates,
3673 Kokkos::View<scalar_t**, device_t> & mj_weights,
3674 Kokkos::View<part_t*, device_t> & mj_parts,
3675 Kokkos::View<scalar_t *, device_t> & mj_cut_coordinates,
3676 Kokkos::View<index_t *, device_t> & mj_part_xadj,
3677 bool mj_uniform_weights0,
3678 scalar_t mj_sEpsilon
3679 #
if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3680 ,Kokkos::View<double *, device_t> & mj_current_part_weights,
3681 Kokkos::View<scalar_t *, device_t> & mj_current_left_closest,
3682 Kokkos::View<scalar_t *, device_t> & mj_current_right_closest
3685 loop_count(mj_loop_count),
3687 concurrent_current_part(mj_concurrent_current_part),
3688 num_cuts(mj_num_cuts),
3689 current_work_part(mj_current_work_part),
3690 current_concurrent_num_parts(mj_current_concurrent_num_parts),
3693 value_count(mj_weight_array_size+mj_left_right_array_size),
3702 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3703 ,current_part_weights(mj_current_part_weights),
3704 current_left_closest(mj_current_left_closest),
3705 current_right_closest(mj_current_right_closest)
3711 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3712 int result =
sizeof(array_t) *
3715 int result =
sizeof(array_t) *
3720 int remainder = result % 8;
3721 if(remainder != 0) {
3722 result += 8 - remainder;
3727 KOKKOS_INLINE_FUNCTION
3728 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3734 index_t all_begin = (concurrent_current_part == 0) ? 0 :
3736 index_t all_end =
part_xadj(concurrent_current_part);
3738 index_t num_working_points = all_end - all_begin;
3739 int num_teams = teamMember.league_size();
3741 index_t stride = num_working_points / num_teams;
3742 if((num_working_points % num_teams) > 0) {
3749 index_t begin = all_begin + stride * teamMember.league_rank();
3750 index_t end = begin + stride;
3755 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3759 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3763 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
3767 for(
int n = value_count_weights;
3773 teamMember.team_barrier();
3775 Kokkos::parallel_for(
3776 Kokkos::TeamThreadRange(teamMember, begin, end),
3778 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3783 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3796 Kokkos::parallel_reduce(
3797 Kokkos::TeamThreadRange(teamMember, begin, end),
3798 #
if (__cplusplus > 201703L)
3803 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3810 index_t part =
parts(i)/2;
3821 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3822 Kokkos::atomic_add(&shared_ptr[part*2], w);
3824 threadSum.
ptr[part*2] += w;
3830 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3831 array_t new_value = (array_t) coord;
3833 while(new_value < prev_value) {
3834 prev_value = Kokkos::atomic_compare_exchange(
3836 prev_value, new_value);
3839 while(new_value > prev_value) {
3840 prev_value = Kokkos::atomic_compare_exchange(
3842 prev_value, new_value);
3859 else if(part != num_cuts) {
3860 if(coord < b + sEpsilon && coord > b -
sEpsilon) {
3864 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3865 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3869 threadSum.
ptr[part*2+1] += w;
3874 parts(i) = part*2+1;
3880 part_t base_b = part;
3883 while(part < num_cuts) {
3885 scalar_t delta = b - base_coord;
3886 if(delta < 0) delta = -delta;
3891 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3892 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3896 threadSum.
ptr[part*2+1] += w;
3907 scalar_t delta = b - base_coord;
3908 if(delta < 0) delta = -delta;
3913 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3914 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3918 threadSum.
ptr[part*2+1] += w;
3931 if(loop_count != 0) {
3943 if(part == lower + 1) {
3948 part -= (part - lower)/2;
3951 else if(part == upper - 1) {
3956 part += (upper - part)/2;
3960 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3962 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3963 }, arraySumReducer);
3964 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3966 teamMember.team_barrier();
3969 #if (__cplusplus > 201703L)
3970 Kokkos::single(Kokkos::PerTeam(teamMember), [=,
this] () {
3972 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
3975 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3976 Kokkos::atomic_add(¤t_part_weights(n),
3977 static_cast<double>(shared_ptr[n]));
3978 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3979 teamSum[n] += array.
ptr[n];
3980 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3983 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3984 int insert_left = 0;
3985 int insert_right = 0;
3988 for(
int n = 2 + value_count_weights;
3990 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3991 scalar_t new_value = shared_ptr[n+1];
3992 scalar_t prev_value = current_right_closest(insert_right);
3993 while(new_value < prev_value) {
3994 prev_value = Kokkos::atomic_compare_exchange(
3995 ¤t_right_closest(insert_right), prev_value, new_value);
3998 new_value = shared_ptr[n];
3999 prev_value = current_left_closest(insert_left);
4000 while(new_value > prev_value) {
4001 prev_value = Kokkos::atomic_compare_exchange(
4002 ¤t_left_closest(insert_left), prev_value, new_value);
4007 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4008 if(array.
ptr[n] > teamSum[n]) {
4009 teamSum[n] = array.
ptr[n];
4011 if(array.
ptr[n+1] < teamSum[n+1]) {
4012 teamSum[n+1] = array.
ptr[n+1];
4014 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4018 teamMember.team_barrier();
4021 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4022 KOKKOS_INLINE_FUNCTION
4028 for(
int n = value_count_weights + 2;
4030 if(src[n] > dst[n]) {
4033 if(src[n+1] < dst[n+1]) {
4034 dst[n+1] = src[n+1];
4044 for(
int n = value_count_weights;
4050 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4060 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4061 typename mj_part_t,
typename mj_node_t>
4062 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t,mj_part_t, mj_node_t>::
4063 mj_1D_part_get_part_weights(
4064 mj_part_t current_concurrent_num_parts,
4065 mj_part_t current_work_part,
4066 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
4069 auto local_is_cut_line_determined = is_cut_line_determined;
4070 auto local_thread_part_weights = thread_part_weights;
4071 auto local_thread_cut_left_closest_point = thread_cut_left_closest_point;
4072 auto local_thread_cut_right_closest_point = thread_cut_right_closest_point;
4076 auto local_sEpsilon = this->
sEpsilon;
4077 auto local_assigned_part_ids = this->assigned_part_ids;
4078 auto local_coordinate_permutations = this->coordinate_permutations;
4079 auto local_mj_weights = this->mj_weights;
4081 auto local_global_min_max_coord_total_weight =
4082 this->global_min_max_coord_total_weight;
4084 typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;
4086 auto local_device_num_partitioning_in_current_dim =
4087 device_num_partitioning_in_current_dim;
4089 Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);
4090 auto local_device_incomplete_cut_count = device_incomplete_cut_count;
4092 mj_part_t total_part_shift = 0;
4094 mj_part_t concurrent_cut_shifts = 0;
4096 Kokkos::View<mj_scalar_t *, device_t> local_temp_cut_coords =
4097 Kokkos::subview(temp_cut_coords, std::pair<mj_lno_t, mj_lno_t>(
4098 concurrent_cut_shifts, temp_cut_coords.size()));
4100 mj_part_t num_parts =
4101 host_num_partitioning_in_current_dim(current_work_part + kk);
4102 mj_part_t num_cuts = num_parts - 1;
4103 mj_part_t total_part_count = num_parts +
num_cuts;
4104 mj_part_t weight_array_length = num_cuts + num_parts;
4107 mj_part_t right_left_array_length = (num_cuts + 2) * 2;
4109 if(this->incomplete_cut_count(kk) == 0) {
4110 total_part_shift += total_part_count;
4116 auto policy_ReduceWeightsFunctor = policy_t(
4117 mj_num_teams ? mj_num_teams : 60, Kokkos::AUTO);
4119 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4120 int total_array_length =
4121 weight_array_length + right_left_array_length;
4128 typedef mj_scalar_t array_t;
4130 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4131 Kokkos::View<array_t*, Kokkos::HostSpace> reduce_array(
"reduce_array", total_array_length);
4132 #endif // KOKKOS_ENABLE_CUDA && KOKKOS_ENABLE_HIP
4134 int offset_cuts = 0;
4135 for(
int kk2 = 0; kk2 < kk; ++kk2) {
4137 host_num_partitioning_in_current_dim(current_work_part + kk2) - 1;
4139 Kokkos::View<double *, device_t> my_current_part_weights =
4140 Kokkos::subview(local_thread_part_weights,
4141 std::pair<mj_lno_t, mj_lno_t>(total_part_shift,
4142 total_part_shift + total_part_count));
4143 Kokkos::View<mj_scalar_t *, device_t> my_current_left_closest =
4144 Kokkos::subview(local_thread_cut_left_closest_point,
4145 std::pair<mj_lno_t, mj_lno_t>(
4147 local_thread_cut_left_closest_point.size()));
4148 Kokkos::View<mj_scalar_t *, device_t> my_current_right_closest =
4149 Kokkos::subview(local_thread_cut_right_closest_point,
4150 std::pair<mj_lno_t, mj_lno_t>(
4152 local_thread_cut_right_closest_point.size()));
4154 array_t
max_scalar = std::numeric_limits<array_t>::max();
4156 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4158 Kokkos::parallel_for(
4159 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4160 KOKKOS_LAMBDA (
int dummy) {
4161 for(
int n = 0; n < weight_array_length; ++n) {
4162 my_current_part_weights(n) = 0;
4164 for(
int n = 0; n <
num_cuts; ++n) {
4171 mj_part_t concurrent_current_part =
4172 current_work_part + kk;
4175 typename mj_node_t::device_type, array_t>
4179 concurrent_current_part,
4182 current_concurrent_num_parts,
4183 right_left_array_length,
4184 weight_array_length,
4185 coordinate_permutations,
4186 mj_current_dim_coords,
4189 local_temp_cut_coords,
4191 mj_uniform_weights(0),
4193 #
if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4194 ,my_current_part_weights,
4195 my_current_left_closest,
4196 my_current_right_closest
4200 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4201 Kokkos::parallel_for(policy_ReduceWeightsFunctor, teamFunctor);
4203 Kokkos::parallel_reduce(policy_ReduceWeightsFunctor,
4204 teamFunctor, reduce_array);
4208 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4209 auto hostArray = Kokkos::create_mirror_view(my_current_part_weights);
4211 for(
int i = 0; i < static_cast<int>(total_part_count); ++i) {
4212 hostArray(i) = reduce_array[i];
4215 Kokkos::deep_copy(my_current_part_weights, hostArray);
4217 auto hostLeftArray = Kokkos::create_mirror_view(my_current_left_closest);
4218 auto hostRightArray = Kokkos::create_mirror_view(my_current_right_closest);
4219 for(mj_part_t cut = 0; cut <
num_cuts; ++cut) {
4220 hostLeftArray(cut) = reduce_array[weight_array_length + (cut+1)*2+0];
4221 hostRightArray(cut) = reduce_array[weight_array_length + (cut+1)*2+1];
4223 Kokkos::deep_copy(my_current_left_closest, hostLeftArray);
4224 Kokkos::deep_copy(my_current_right_closest, hostRightArray);
4227 total_part_shift += total_part_count;
4231 auto local_temp_cut_coords = temp_cut_coords;
4233 Kokkos::parallel_for(
4234 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
4235 (0, current_concurrent_num_parts), KOKKOS_LAMBDA(mj_part_t kk) {
4236 mj_part_t num_parts = local_device_num_partitioning_in_current_dim(
4237 current_work_part + kk);
4238 mj_part_t num_cuts = num_parts - 1;
4239 mj_part_t total_part_count = num_parts +
num_cuts;
4241 if(local_device_incomplete_cut_count(kk) > 0) {
4245 size_t offset_cuts = 0;
4246 for(mj_part_t kk2 = 0; kk2 < kk; ++kk2) {
4247 auto num_parts_kk2 = local_device_num_partitioning_in_current_dim(
4248 current_work_part + kk2);
4249 offset += num_parts_kk2 * 2 - 1;
4250 offset_cuts += num_parts_kk2 - 1;
4253 for(mj_part_t i = 1; i < total_part_count; ++i) {
4257 if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
4258 std::abs(local_temp_cut_coords(offset_cuts + i / 2) -
4259 local_temp_cut_coords(offset_cuts + i /2 - 1))
4264 local_thread_part_weights(offset + i)
4265 = local_thread_part_weights(offset + i-2);
4270 local_thread_part_weights(offset + i) +=
4271 local_thread_part_weights(offset + i-1);
4284 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4285 typename mj_part_t,
typename mj_node_t>
4286 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4287 mj_combine_rightleft_and_weights(
4288 mj_part_t current_work_part,
4289 mj_part_t current_concurrent_num_parts)
4291 auto local_thread_part_weights = this->thread_part_weights;
4292 auto local_is_cut_line_determined = this->is_cut_line_determined;
4293 auto local_thread_cut_left_closest_point =
4294 this->thread_cut_left_closest_point;
4295 auto local_thread_cut_right_closest_point =
4296 this->thread_cut_right_closest_point;
4297 auto local_total_part_weight_left_right_closests =
4298 this->total_part_weight_left_right_closests;
4299 auto local_device_num_partitioning_in_current_dim =
4300 device_num_partitioning_in_current_dim;
4301 Kokkos::parallel_for(
4302 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0,1),
4303 KOKKOS_LAMBDA (
int dummy) {
4305 size_t tlr_array_shift = 0;
4306 mj_part_t cut_shift = 0;
4307 size_t total_part_array_shift = 0;
4313 mj_part_t num_parts_in_part =
4314 local_device_num_partitioning_in_current_dim(current_work_part + i);
4315 mj_part_t num_cuts_in_part = num_parts_in_part - 1;
4316 size_t num_total_part_in_part =
4317 num_parts_in_part + size_t (num_cuts_in_part);
4320 for(
int ii = 0; ii < num_cuts_in_part; ++ii) {
4321 mj_part_t next = tlr_array_shift + ii;
4322 mj_part_t cut_index = cut_shift + ii;
4324 if(!local_is_cut_line_determined(cut_index)) {
4325 mj_scalar_t left_closest_in_process =
4326 local_thread_cut_left_closest_point(cut_index);
4327 mj_scalar_t right_closest_in_process =
4328 local_thread_cut_right_closest_point(cut_index);
4331 local_total_part_weight_left_right_closests(
4332 num_total_part_in_part + next) = left_closest_in_process;
4334 local_total_part_weight_left_right_closests(
4335 num_total_part_in_part + num_cuts_in_part + next) =
4336 right_closest_in_process;
4340 for(
size_t j = 0; j < num_total_part_in_part; ++j) {
4341 mj_part_t cut_ind = j / 2 + cut_shift;
4347 if(j == num_total_part_in_part - 1 ||
4348 !local_is_cut_line_determined(cut_ind)) {
4349 double pwj = local_thread_part_weights(total_part_array_shift + j);
4350 local_total_part_weight_left_right_closests(tlr_array_shift + j) = pwj;
4355 cut_shift += num_cuts_in_part;
4356 tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
4357 total_part_array_shift += num_total_part_in_part;
4374 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4375 typename mj_part_t,
typename mj_node_t>
4376 KOKKOS_INLINE_FUNCTION
4377 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
4378 mj_node_t>::mj_calculate_new_cut_position(mj_scalar_t cut_upper_bound,
4379 mj_scalar_t cut_lower_bound,
4380 mj_scalar_t cut_upper_weight,
4381 mj_scalar_t cut_lower_weight,
4382 mj_scalar_t expected_weight,
4383 mj_scalar_t &new_cut_position,
4386 if(std::abs(cut_upper_bound - cut_lower_bound) < sEpsilon) {
4387 new_cut_position = cut_upper_bound;
4390 if(std::abs(cut_upper_weight - cut_lower_weight) < sEpsilon) {
4391 new_cut_position = cut_lower_bound;
4394 mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
4395 mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
4396 mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
4398 mj_scalar_t required_shift = (my_weight_diff / weight_range);
4399 int scale_constant = 20;
4400 int shiftint= int (required_shift * scale_constant);
4401 if(shiftint == 0) shiftint = 1;
4402 required_shift = mj_scalar_t (shiftint) / scale_constant;
4403 new_cut_position = coordinate_range * required_shift + cut_lower_bound;
4406 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4408 template<
class policy_t,
class scalar_t>
4418 int mj_value_count) :
4423 KOKKOS_INLINE_FUNCTION
4428 KOKKOS_INLINE_FUNCTION
4431 dst.
ptr[n] += src.
ptr[n];
4436 dst.
ptr = value->ptr;
4445 template<
class policy_t,
class scalar_t,
class part_t,
class index_t,
4451 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4463 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4464 Kokkos::View<int *, device_t> local_point_counts;
4465 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4468 part_t mj_concurrent_current_part,
4469 part_t mj_weight_array_size,
4470 Kokkos::View<index_t*, device_t> & mj_permutations,
4471 Kokkos::View<scalar_t *, device_t> & mj_coordinates,
4472 Kokkos::View<part_t*, device_t> & mj_parts,
4473 Kokkos::View<index_t *, device_t> & mj_part_xadj,
4474 Kokkos::View<index_t *, device_t> & mj_track_on_cuts
4475 #
if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4476 ,Kokkos::View<int *, device_t> & mj_local_point_counts
4479 concurrent_current_part(mj_concurrent_current_part),
4485 track_on_cuts(mj_track_on_cuts)
4486 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4487 ,local_point_counts(mj_local_point_counts)
4493 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4496 int result =
sizeof(array_t) * (
value_count) * team_size;
4500 int remainder = result % 8;
4501 if(remainder != 0) {
4502 result += 8 - remainder;
4507 KOKKOS_INLINE_FUNCTION
4508 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4513 index_t all_begin = (concurrent_current_part == 0) ? 0 :
4515 index_t all_end =
part_xadj(concurrent_current_part);
4517 index_t num_working_points = all_end - all_begin;
4518 int num_teams = teamMember.league_size();
4520 index_t stride = num_working_points / num_teams;
4521 if((num_working_points % num_teams) > 0) {
4525 index_t begin = all_begin + stride * teamMember.league_rank();
4526 index_t end = begin + stride;
4531 int track_on_cuts_insert_index = track_on_cuts.size() - 1;
4534 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4535 size_t sh_mem_size =
sizeof(array_t) * (
value_count);
4537 size_t sh_mem_size =
4538 sizeof(array_t) * (
value_count) * teamMember.team_size();
4541 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
4544 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4546 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
4551 teamMember.team_barrier();
4553 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, begin, end),
4555 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4563 Kokkos::parallel_reduce(
4564 Kokkos::TeamThreadRange(teamMember, begin, end),
4565 #
if (__cplusplus > 201703L)
4570 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4573 part_t place =
parts(coordinate_index);
4574 part_t part = place / 2;
4575 if(place % 2 == 0) {
4576 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4577 Kokkos::atomic_add(&shared_ptr[part], 1);
4579 threadSum.
ptr[part] += 1;
4582 parts(coordinate_index) = part;
4587 index_t set_index = Kokkos::atomic_fetch_add(
4588 &track_on_cuts(track_on_cuts_insert_index), 1);
4589 track_on_cuts(set_index) = ii;
4591 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4593 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4595 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4597 teamMember.team_barrier();
4600 #if (__cplusplus > 201703L)
4601 Kokkos::single(Kokkos::PerTeam(teamMember), [=,
this] () {
4603 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
4606 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4607 Kokkos::atomic_add(&local_point_counts(n), shared_ptr[n]);
4608 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4609 teamSum[n] += array.
ptr[n];
4610 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4614 teamMember.team_barrier();
4617 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4619 KOKKOS_INLINE_FUNCTION
4649 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4650 typename mj_part_t,
typename mj_node_t>
4651 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4652 mj_create_new_partitions(
4653 mj_part_t num_parts,
4654 mj_part_t current_concurrent_work_part,
4655 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
4656 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
4657 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
4658 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj)
4661 auto local_thread_part_weight_work = this->thread_part_weight_work;
4662 auto local_point_counts = this->thread_point_counts;
4663 auto local_distribute_points_on_cut_lines =
4664 this->distribute_points_on_cut_lines;
4665 auto local_thread_cut_line_weight_to_put_left =
4666 this->thread_cut_line_weight_to_put_left;
4667 auto local_sEpsilon = this->
sEpsilon;
4668 auto local_coordinate_permutations = this->coordinate_permutations;
4669 auto local_mj_weights = this->mj_weights;
4670 auto local_assigned_part_ids = this->assigned_part_ids;
4671 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
4673 mj_part_t num_cuts = num_parts - 1;
4675 Kokkos::parallel_for(
4676 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4677 KOKKOS_LAMBDA(
int dummy) {
4679 if(local_distribute_points_on_cut_lines) {
4680 for(
int i = 0; i <
num_cuts; ++i) {
4681 mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
4682 if(left_weight > local_sEpsilon) {
4684 mj_scalar_t thread_ii_weight_on_cut =
4685 local_thread_part_weight_work(i * 2 + 1) -
4686 local_thread_part_weight_work(i * 2);
4688 if(thread_ii_weight_on_cut < left_weight) {
4690 local_thread_cut_line_weight_to_put_left(i) =
4691 thread_ii_weight_on_cut;
4695 local_thread_cut_line_weight_to_put_left(i) = left_weight;
4697 left_weight -= thread_ii_weight_on_cut;
4700 local_thread_cut_line_weight_to_put_left(i) = 0;
4706 for(mj_part_t i = num_cuts - 1; i > 0 ; --i) {
4707 if(std::abs(current_concurrent_cut_coordinate(i) -
4708 current_concurrent_cut_coordinate(i -1)) < local_sEpsilon) {
4709 local_thread_cut_line_weight_to_put_left(i) -=
4710 local_thread_cut_line_weight_to_put_left(i - 1);
4712 local_thread_cut_line_weight_to_put_left(i) =
4713 static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
4714 least_signifiance) * significance_mul) /
4715 static_cast<mj_scalar_t
>(significance_mul);
4719 for(mj_part_t i = 0; i < num_parts; ++i) {
4720 local_point_counts(i) = 0;
4724 mj_lno_t coordinate_begin_index =
4725 current_concurrent_work_part == 0 ? 0 :
4726 host_part_xadj(current_concurrent_work_part - 1);
4727 mj_lno_t coordinate_end_index =
4728 host_part_xadj(current_concurrent_work_part);
4730 mj_lno_t total_on_cut;
4731 Kokkos::parallel_reduce(
"Get total_on_cut",
4732 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (
4733 coordinate_begin_index, coordinate_end_index),
4734 KOKKOS_LAMBDA(
int ii, mj_lno_t & val) {
4735 mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4736 mj_part_t coordinate_assigned_place =
4737 local_assigned_part_ids(coordinate_index);
4738 if(coordinate_assigned_place % 2 == 1) {
4743 Kokkos::View<mj_lno_t *, device_t> track_on_cuts;
4744 if(total_on_cut > 0) {
4745 track_on_cuts = Kokkos::View<mj_lno_t *, device_t>(
4756 typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;
4759 int use_num_teams = mj_num_teams ? mj_num_teams : 60;
4761 auto policy_ReduceFunctor = policy_t(use_num_teams, Kokkos::AUTO);
4762 typedef int array_t;
4766 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4767 Kokkos::View<array_t*, Kokkos::HostSpace> reduce_array(
"reduce_array", num_parts);
4770 ReduceArrayFunctor<policy_t, mj_scalar_t, mj_part_t, mj_lno_t,
4771 typename mj_node_t::device_type, array_t>teamFunctor(
4772 current_concurrent_work_part,
4774 coordinate_permutations,
4775 mj_current_dim_coords,
4779 #
if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4784 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4785 Kokkos::parallel_for(policy_ReduceFunctor, teamFunctor);
4787 Kokkos::parallel_reduce(policy_ReduceFunctor, teamFunctor, reduce_array);
4791 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4792 for(mj_part_t part = 0; part < num_parts; ++part) {
4793 local_point_counts(part) = reduce_array[part];
4799 if(track_on_cuts.size() > 0) {
4800 auto track_on_cuts_sort = Kokkos::subview(track_on_cuts,
4801 std::pair<mj_lno_t, mj_lno_t>(0, track_on_cuts.size() - 1));
4802 Kokkos::sort(track_on_cuts_sort);
4806 Kokkos::parallel_for(
4807 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4808 KOKKOS_LAMBDA (
int dummy) {
4810 for(
int j = 0; j < total_on_cut; ++j) {
4811 int ii = track_on_cuts(j);
4812 mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4813 mj_scalar_t coordinate_weight = uniform_weights0 ? 1 :
4814 local_mj_weights(coordinate_index,0);
4815 mj_part_t coordinate_assigned_place =
4816 local_assigned_part_ids(coordinate_index);
4817 mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
4819 if(local_distribute_points_on_cut_lines &&
4820 local_thread_cut_line_weight_to_put_left(
4821 coordinate_assigned_part) > local_sEpsilon) {
4825 local_thread_cut_line_weight_to_put_left(
4826 coordinate_assigned_part) -= coordinate_weight;
4831 if(local_thread_cut_line_weight_to_put_left(
4832 coordinate_assigned_part) < 0 && coordinate_assigned_part <
4834 std::abs(current_concurrent_cut_coordinate(
4835 coordinate_assigned_part+1) -
4836 current_concurrent_cut_coordinate(
4837 coordinate_assigned_part)) < local_sEpsilon)
4839 local_thread_cut_line_weight_to_put_left(
4840 coordinate_assigned_part + 1) +=
4841 local_thread_cut_line_weight_to_put_left(
4842 coordinate_assigned_part);
4844 ++local_point_counts(coordinate_assigned_part);
4845 local_assigned_part_ids(coordinate_index) =
4846 coordinate_assigned_part;
4851 ++coordinate_assigned_part;
4854 while(local_distribute_points_on_cut_lines &&
4855 coordinate_assigned_part < num_cuts)
4858 if(std::abs(current_concurrent_cut_coordinate(
4859 coordinate_assigned_part) -
4860 current_concurrent_cut_coordinate(
4861 coordinate_assigned_part - 1)) < local_sEpsilon)
4864 if(local_thread_cut_line_weight_to_put_left(
4865 coordinate_assigned_part) > local_sEpsilon &&
4866 local_thread_cut_line_weight_to_put_left(
4867 coordinate_assigned_part) >=
4868 std::abs(local_thread_cut_line_weight_to_put_left(
4869 coordinate_assigned_part) - coordinate_weight))
4871 local_thread_cut_line_weight_to_put_left(
4872 coordinate_assigned_part) -= coordinate_weight;
4876 if(local_thread_cut_line_weight_to_put_left(
4877 coordinate_assigned_part) < 0 &&
4878 coordinate_assigned_part < num_cuts - 1 &&
4879 std::abs(current_concurrent_cut_coordinate(
4880 coordinate_assigned_part+1) -
4881 current_concurrent_cut_coordinate(
4882 coordinate_assigned_part)) < local_sEpsilon)
4884 local_thread_cut_line_weight_to_put_left(
4885 coordinate_assigned_part + 1) +=
4886 local_thread_cut_line_weight_to_put_left(
4887 coordinate_assigned_part);
4895 ++coordinate_assigned_part;
4897 local_point_counts(coordinate_assigned_part) += 1;
4898 local_assigned_part_ids(coordinate_index) = coordinate_assigned_part;
4902 for(
int j = 0; j < num_parts; ++j) {
4903 out_part_xadj(j) = local_point_counts(j);
4904 local_point_counts(j) = 0;
4907 out_part_xadj(j) += out_part_xadj(j - 1);
4908 local_point_counts(j) += out_part_xadj(j - 1);
4916 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4921 Kokkos::parallel_for(
4922 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
4923 coordinate_begin_index, coordinate_end_index),
4924 KOKKOS_LAMBDA (mj_lno_t ii) {
4925 mj_lno_t i = local_coordinate_permutations(ii);
4926 mj_part_t p = local_assigned_part_ids(i);
4927 mj_lno_t idx = Kokkos::atomic_fetch_add(&local_point_counts(p), 1);
4928 local_new_coordinate_permutations(coordinate_begin_index + idx) = i;
4933 #ifdef KOKKOS_ENABLE_OPENMP
4935 const int num_threads = 1;
4937 const int num_threads = 1;
4940 const int num_teams = 1;
4943 Kokkos::View<mj_lno_t*, device_t>
4944 point_counter(
"insert indices", num_teams * num_threads * num_parts);
4948 Kokkos::TeamPolicy<typename mj_node_t::execution_space>
4949 block_policy(num_teams, num_threads);
4950 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>
::
4952 mj_lno_t range = coordinate_end_index - coordinate_begin_index;
4953 mj_lno_t block_size = range / num_teams + 1;
4954 Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(member_type team_member) {
4955 int team = team_member.league_rank();
4956 int team_offset = team * num_threads * num_parts;
4957 mj_lno_t begin = coordinate_begin_index + team * block_size;
4958 mj_lno_t end = begin + block_size;
4959 if(end > coordinate_end_index) {
4960 end = coordinate_end_index;
4963 Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
4965 int thread = team_member.team_rank();
4966 mj_lno_t i = local_coordinate_permutations(ii);
4967 mj_part_t p = local_assigned_part_ids(i);
4968 int index = team_offset + thread * num_parts + p;
4969 ++point_counter(index);
4977 Kokkos::parallel_for(
4978 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4979 KOKKOS_LAMBDA (
int dummy) {
4980 int num_sets = point_counter.size() / num_parts;
4981 for(
int set = num_sets - 1; set >= 1; set -=1) {
4982 int base = set * num_parts;
4983 for(
int part = 0; part < num_parts; ++part) {
4984 point_counter(base + part) = point_counter(base + part - num_parts);
4988 for(
int part = 0; part < num_parts; ++part) {
4989 point_counter(part) = 0;
4992 for(
int set = 1; set < num_sets; ++set) {
4993 int base = set * num_parts;
4994 for(
int part = 0; part < num_parts; ++part) {
4995 point_counter(base + part) += point_counter(base + part - num_parts);
5001 Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(member_type team_member) {
5002 int team = team_member.league_rank();
5003 int team_offset = team * num_threads * num_parts;
5004 mj_lno_t begin = coordinate_begin_index + team * block_size;
5005 mj_lno_t end = begin + block_size;
5006 if(end > coordinate_end_index) {
5007 end = coordinate_end_index;
5009 Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
5011 int thread = team_member.team_rank();
5012 mj_lno_t i = local_coordinate_permutations(ii);
5013 mj_part_t p = local_assigned_part_ids(i);
5014 int index = team_offset + thread * num_parts + p;
5015 int set_counter = (point_counter(index)++) + local_point_counts(p);
5016 local_new_coordinate_permutations(coordinate_begin_index + set_counter) = i;
5065 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5066 typename mj_part_t,
typename mj_node_t>
5067 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
5068 mj_node_t>::mj_get_new_cut_coordinates(
5069 mj_part_t current_concurrent_num_parts,
5071 const mj_part_t &num_cuts,
5072 const double &used_imbalance_tolerance,
5073 Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
5074 Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
5075 Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
5076 Kokkos::View<bool *, device_t> & current_cut_line_determined,
5077 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
5078 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
5079 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
5080 Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
5081 Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
5082 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
5083 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
5084 Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
5085 Kokkos::View<mj_scalar_t *, device_t> &
5086 current_part_cut_line_weight_to_put_left,
5087 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count)
5089 Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);
5091 auto local_device_incomplete_cut_count = device_incomplete_cut_count;
5093 auto local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
5094 auto local_global_rectilinear_cut_weight = global_rectilinear_cut_weight;
5095 auto local_process_rectilinear_cut_weight = process_rectilinear_cut_weight;
5096 auto local_global_min_max_coord_total_weight =
5097 global_min_max_coord_total_weight;
5099 const auto _sEpsilon = this->
sEpsilon;
5107 Kokkos::TeamPolicy<typename mj_node_t::execution_space>
5108 policy_one_team(1, Kokkos::AUTO());
5109 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>
::
5111 Kokkos::parallel_for(policy_one_team, KOKKOS_LAMBDA(member_type team_member) {
5113 mj_scalar_t min_coordinate =
5114 local_global_min_max_coord_total_weight(kk);
5115 mj_scalar_t max_coordinate =
5116 local_global_min_max_coord_total_weight(
5117 kk + current_concurrent_num_parts);
5118 mj_scalar_t global_total_weight =
5119 local_global_min_max_coord_total_weight(
5120 kk + current_concurrent_num_parts * 2);
5122 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, num_cuts),
5127 current_global_left_closest_points(i) > local_sEpsilon) {
5128 current_global_left_closest_points(i) =
5129 current_cut_coordinates(i);
5131 if(current_global_right_closest_points(i) -
5132 max_coordinate > local_sEpsilon) {
5133 current_global_right_closest_points(i) =
5134 current_cut_coordinates(i);
5137 team_member.team_barrier();
5139 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, num_cuts),
5141 using algMJ_t = AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
5144 mj_scalar_t seen_weight_in_part = 0;
5146 mj_scalar_t expected_weight_in_part = 0;
5148 double imbalance_on_left = 0, imbalance_on_right = 0;
5149 if(local_distribute_points_on_cut_lines) {
5151 local_global_rectilinear_cut_weight(i) = 0;
5152 local_process_rectilinear_cut_weight(i) = 0;
5154 bool bContinue =
false;
5157 if(current_cut_line_determined(i)) {
5158 new_current_cut_coordinates(i) =
5159 current_cut_coordinates(i);
5164 seen_weight_in_part = current_global_part_weights(i * 2);
5167 expected_weight_in_part = current_part_target_weights(i);
5170 imbalance_on_left = algMJ_t::calculate_imbalance(seen_weight_in_part,
5171 expected_weight_in_part);
5174 imbalance_on_right = algMJ_t::calculate_imbalance(global_total_weight -
5175 seen_weight_in_part, global_total_weight - expected_weight_in_part);
5176 bool is_left_imbalance_valid = std::abs(imbalance_on_left) -
5177 used_imbalance_tolerance < local_sEpsilon ;
5178 bool is_right_imbalance_valid = std::abs(imbalance_on_right) -
5179 used_imbalance_tolerance < local_sEpsilon;
5181 if(is_left_imbalance_valid && is_right_imbalance_valid) {
5182 current_cut_line_determined(i) =
true;
5183 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5184 new_current_cut_coordinates(i) = current_cut_coordinates(i);
5186 else if(imbalance_on_left < 0) {
5188 if(local_distribute_points_on_cut_lines) {
5193 if(current_global_part_weights(i * 2 + 1) ==
5194 expected_weight_in_part) {
5196 current_cut_line_determined(i) =
true;
5197 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5200 new_current_cut_coordinates(i) =
5201 current_cut_coordinates(i);
5203 current_part_cut_line_weight_to_put_left(i) =
5204 current_local_part_weights(i * 2 + 1) -
5205 current_local_part_weights(i * 2);
5208 else if(current_global_part_weights(i * 2 + 1) >
5209 expected_weight_in_part) {
5212 current_cut_line_determined(i) =
true;
5213 Kokkos::atomic_add(&view_rectilinear_cut_count(0), 1);
5217 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5218 new_current_cut_coordinates(i) =
5219 current_cut_coordinates(i);
5220 local_process_rectilinear_cut_weight[i] =
5221 current_local_part_weights(i * 2 + 1) -
5222 current_local_part_weights(i * 2);
5231 current_cut_lower_bounds(i) =
5232 current_global_right_closest_points(i);
5235 current_cut_lower_bound_weights(i) = seen_weight_in_part;
5240 for(mj_part_t ii = i + 1; ii <
num_cuts ; ++ii) {
5241 mj_scalar_t p_weight = current_global_part_weights(ii * 2);
5242 mj_scalar_t line_weight =
5243 current_global_part_weights(ii * 2 + 1);
5244 if(p_weight >= expected_weight_in_part) {
5249 if(p_weight == expected_weight_in_part) {
5250 current_cut_upper_bounds(i) =
5251 current_cut_coordinates(ii);
5252 current_cut_upper_weights(i) = p_weight;
5253 current_cut_lower_bounds(i) =
5254 current_cut_coordinates(ii);
5255 current_cut_lower_bound_weights(i) = p_weight;
5256 }
else if(p_weight < current_cut_upper_weights(i)) {
5259 current_cut_upper_bounds(i) =
5260 current_global_left_closest_points(ii);
5261 current_cut_upper_weights(i) = p_weight;
5267 if(line_weight >= expected_weight_in_part) {
5271 current_cut_upper_bounds(i) =
5272 current_cut_coordinates(ii);
5273 current_cut_upper_weights(i) = line_weight;
5274 current_cut_lower_bounds(i) =
5275 current_cut_coordinates(ii);
5276 current_cut_lower_bound_weights(i) = p_weight;
5281 if(p_weight <= expected_weight_in_part && p_weight >=
5282 current_cut_lower_bound_weights(i)) {
5283 current_cut_lower_bounds(i) =
5284 current_global_right_closest_points(ii);
5285 current_cut_lower_bound_weights(i) = p_weight;
5289 mj_scalar_t new_cut_position = 0;
5290 algMJ_t::mj_calculate_new_cut_position(
5291 current_cut_upper_bounds(i),
5292 current_cut_lower_bounds(i),
5293 current_cut_upper_weights(i),
5294 current_cut_lower_bound_weights(i),
5295 expected_weight_in_part, new_cut_position,
5300 if(std::abs(current_cut_coordinates(i) -
5301 new_cut_position) < local_sEpsilon) {
5302 current_cut_line_determined(i) =
true;
5303 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5306 new_current_cut_coordinates(i) =
5307 current_cut_coordinates(i);
5309 new_current_cut_coordinates(i) = new_cut_position;
5315 current_cut_upper_bounds(i) =
5316 current_global_left_closest_points(i);
5317 current_cut_upper_weights(i) =
5318 seen_weight_in_part;
5321 for(
int ii = i - 1; ii >= 0; --ii) {
5322 mj_scalar_t p_weight =
5323 current_global_part_weights(ii * 2);
5324 mj_scalar_t line_weight =
5325 current_global_part_weights(ii * 2 + 1);
5326 if(p_weight <= expected_weight_in_part) {
5327 if(p_weight == expected_weight_in_part) {
5330 current_cut_upper_bounds(i) =
5331 current_cut_coordinates(ii);
5332 current_cut_upper_weights(i) = p_weight;
5333 current_cut_lower_bounds(i) =
5334 current_cut_coordinates(ii);
5335 current_cut_lower_bound_weights(i) = p_weight;
5337 else if(p_weight > current_cut_lower_bound_weights(i)) {
5340 current_cut_lower_bounds(i) =
5341 current_global_right_closest_points(ii);
5342 current_cut_lower_bound_weights(i) = p_weight;
5348 if(line_weight > expected_weight_in_part) {
5349 current_cut_upper_bounds(i) =
5350 current_global_right_closest_points(ii);
5351 current_cut_upper_weights(i) = line_weight;
5361 if(p_weight >= expected_weight_in_part &&
5362 (p_weight < current_cut_upper_weights(i) ||
5363 (p_weight == current_cut_upper_weights(i) &&
5364 current_cut_upper_bounds(i) >
5365 current_global_left_closest_points(ii)))) {
5366 current_cut_upper_bounds(i) =
5367 current_global_left_closest_points(ii);
5368 current_cut_upper_weights(i) = p_weight;
5371 mj_scalar_t new_cut_position = 0;
5372 algMJ_t::mj_calculate_new_cut_position(
5373 current_cut_upper_bounds(i),
5374 current_cut_lower_bounds(i),
5375 current_cut_upper_weights(i),
5376 current_cut_lower_bound_weights(i),
5377 expected_weight_in_part,
5382 if(std::abs(current_cut_coordinates(i) -
5383 new_cut_position) < local_sEpsilon) {
5384 current_cut_line_determined(i) =
true;
5385 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5387 new_current_cut_coordinates(i) =
5388 current_cut_coordinates(i);
5390 new_current_cut_coordinates(i) =
5397 team_member.team_barrier();
5401 mj_part_t rectilinear_cut_count;
5402 Kokkos::parallel_reduce(
"Read bDoingWork",
5403 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
5404 KOKKOS_LAMBDA(
int dummy,
int & set_single) {
5405 set_single = view_rectilinear_cut_count(0);
5406 }, rectilinear_cut_count);
5408 if(rectilinear_cut_count > 0) {
5409 auto host_local_process_rectilinear_cut_weight =
5410 Kokkos::create_mirror_view(Kokkos::HostSpace(),
5411 local_process_rectilinear_cut_weight);
5412 auto host_local_global_rectilinear_cut_weight =
5413 Kokkos::create_mirror_view(Kokkos::HostSpace(),
5414 local_global_rectilinear_cut_weight);
5415 Kokkos::deep_copy(host_local_process_rectilinear_cut_weight,
5416 local_process_rectilinear_cut_weight);
5417 Kokkos::deep_copy(host_local_global_rectilinear_cut_weight,
5418 local_global_rectilinear_cut_weight);
5419 Teuchos::scan<int,mj_scalar_t>(
5420 *comm, Teuchos::REDUCE_SUM,
5422 host_local_process_rectilinear_cut_weight.data(),
5423 host_local_global_rectilinear_cut_weight.data());
5424 Kokkos::deep_copy(local_process_rectilinear_cut_weight,
5425 host_local_process_rectilinear_cut_weight);
5426 Kokkos::deep_copy(local_global_rectilinear_cut_weight,
5427 host_local_global_rectilinear_cut_weight);
5429 Kokkos::parallel_for(
"finish up mj_get_new_cut_coordinates",
5430 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
5431 KOKKOS_LAMBDA(
int dummy) {
5432 for(mj_part_t i = 0; i <
num_cuts; ++i) {
5434 if(local_global_rectilinear_cut_weight(i) > 0) {
5436 mj_scalar_t expected_part_weight = current_part_target_weights(i);
5438 mj_scalar_t necessary_weight_on_line_for_left =
5439 expected_part_weight - current_global_part_weights(i * 2);
5442 mj_scalar_t my_weight_on_line =
5443 local_process_rectilinear_cut_weight(i);
5447 mj_scalar_t weight_on_line_upto_process_inclusive =
5448 local_global_rectilinear_cut_weight(i);
5452 mj_scalar_t space_to_put_left =
5453 necessary_weight_on_line_for_left -
5454 weight_on_line_upto_process_inclusive;
5457 mj_scalar_t space_left_to_me =
5458 space_to_put_left + my_weight_on_line;
5471 if(space_left_to_me < 0) {
5474 current_part_cut_line_weight_to_put_left(i) = 0;
5476 else if(space_left_to_me >= my_weight_on_line) {
5480 current_part_cut_line_weight_to_put_left(i) =
5487 current_part_cut_line_weight_to_put_left(i) =
5494 view_rectilinear_cut_count(0) = 0;
5498 Kokkos::deep_copy(this->incomplete_cut_count, device_incomplete_cut_count);
5510 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5511 typename mj_part_t,
typename mj_node_t>
5512 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5513 get_processor_num_points_in_parts(
5514 mj_part_t num_procs,
5515 mj_part_t num_parts,
5516 mj_gno_t *&num_points_in_all_processor_parts)
5519 size_t allocation_size = num_parts * (num_procs + 1);
5526 mj_gno_t *num_local_points_in_each_part_to_reduce_sum =
5527 new mj_gno_t[allocation_size];
5531 mj_gno_t *my_local_points_to_reduce_sum =
5532 num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
5536 mj_gno_t *my_local_point_counts_in_each_part =
5537 num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
5540 memset(num_local_points_in_each_part_to_reduce_sum, 0,
5541 sizeof(mj_gno_t)*allocation_size);
5543 auto local_new_part_xadj = this->new_part_xadj;
5544 Kokkos::View<mj_gno_t *, typename mj_node_t::device_type> points_per_part(
5545 Kokkos::ViewAllocateWithoutInitializing(
"points per part"), num_parts);
5546 Kokkos::parallel_for(
"get vals on device",
5547 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_gno_t>
5548 (0, num_parts), KOKKOS_LAMBDA(mj_gno_t i) {
5549 points_per_part(i) =
5550 local_new_part_xadj(i) - ((i == 0) ? 0 : local_new_part_xadj(i-1));
5552 auto host_points_per_part = Kokkos::create_mirror_view(points_per_part);
5553 Kokkos::deep_copy(host_points_per_part, points_per_part);
5554 for(
int i = 0; i < num_parts; ++i) {
5555 my_local_points_to_reduce_sum[i] = host_points_per_part(i);
5560 memcpy (my_local_point_counts_in_each_part, my_local_points_to_reduce_sum,
5561 sizeof(mj_gno_t) * (num_parts) );
5568 reduceAll<int, mj_gno_t>(
5570 Teuchos::REDUCE_SUM,
5572 num_local_points_in_each_part_to_reduce_sum,
5573 num_points_in_all_processor_parts);
5577 delete [] num_local_points_in_each_part_to_reduce_sum;
5595 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5596 typename mj_part_t, typename mj_node_t>
5597 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5598 mj_check_to_migrate(
5599 size_t migration_reduce_all_population,
5600 mj_lno_t num_coords_for_last_dim_part,
5601 mj_part_t num_procs,
5602 mj_part_t num_parts,
5603 mj_gno_t *num_points_in_all_processor_parts)
5606 if(migration_reduce_all_population > future_reduceall_cutoff) {
5611 if(num_coords_for_last_dim_part < min_work_last_dim) {
5616 if(this->check_migrate_avoid_migration_option == 0) {
5617 double global_imbalance = 0;
5619 size_t global_shift = num_procs * num_parts;
5621 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5622 for(mj_part_t i = 0; i < num_parts; ++i) {
5623 double ideal_num = num_points_in_all_processor_parts[global_shift + i]
5624 / double(num_procs);
5626 global_imbalance += std::abs(ideal_num -
5627 num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num);
5630 global_imbalance /= num_parts;
5631 global_imbalance /= num_procs;
5633 if(global_imbalance <= this->minimum_migration_imbalance) {
5659 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5660 typename mj_part_t,
typename mj_node_t>
5661 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5662 assign_send_destinations(
5663 mj_part_t num_parts,
5664 mj_part_t *part_assignment_proc_begin_indices,
5665 mj_part_t *processor_chains_in_parts,
5666 mj_lno_t *send_count_to_each_proc,
5667 int *coordinate_destinations) {
5669 auto host_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
5670 deep_copy(host_new_part_xadj, this->new_part_xadj);
5672 auto host_new_coordinate_permutations =
5673 Kokkos::create_mirror_view(this->new_coordinate_permutations);
5674 deep_copy(host_new_coordinate_permutations, this->new_coordinate_permutations);
5676 for(mj_part_t p = 0; p < num_parts; ++p) {
5677 mj_lno_t part_begin = 0;
5678 if(p > 0) part_begin = host_new_part_xadj(p - 1);
5679 mj_lno_t part_end = host_new_part_xadj(p);
5681 mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
5683 mj_lno_t num_total_send = 0;
5684 for(mj_lno_t j=part_begin; j < part_end; j++) {
5685 mj_lno_t local_ind = host_new_coordinate_permutations(j);
5686 while (num_total_send >= send_count_to_each_proc[proc_to_sent]) {
5690 part_assignment_proc_begin_indices[p] =
5691 processor_chains_in_parts[proc_to_sent];
5693 processor_chains_in_parts[proc_to_sent] = -1;
5695 proc_to_sent = part_assignment_proc_begin_indices[p];
5698 coordinate_destinations[local_ind] = proc_to_sent;
5724 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5725 typename mj_part_t,
typename mj_node_t>
5726 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5727 mj_assign_proc_to_parts(
5728 mj_gno_t * num_points_in_all_processor_parts,
5729 mj_part_t num_parts,
5730 mj_part_t num_procs,
5731 mj_lno_t *send_count_to_each_proc,
5732 std::vector<mj_part_t> &processor_ranks_for_subcomm,
5733 std::vector<mj_part_t> *next_future_num_parts_in_parts,
5734 mj_part_t &out_part_index,
5735 mj_part_t &output_part_numbering_begin_index,
5736 int * coordinate_destinations) {
5737 mj_gno_t *global_num_points_in_parts =
5738 num_points_in_all_processor_parts + num_procs * num_parts;
5739 mj_part_t *num_procs_assigned_to_each_part =
new mj_part_t[num_parts];
5742 bool did_i_find_my_group =
false;
5744 mj_part_t num_free_procs = num_procs;
5745 mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;
5747 double max_imbalance_difference = 0;
5748 mj_part_t max_differing_part = 0;
5751 for(mj_part_t i = 0; i < num_parts; i++) {
5754 double scalar_required_proc = num_procs *
5755 (double (global_num_points_in_parts[i]) /
5756 double (this->num_global_coords));
5759 mj_part_t required_proc =
5760 static_cast<mj_part_t
> (0.5 + scalar_required_proc);
5761 if(required_proc == 0) required_proc = 1;
5767 required_proc < minimum_num_procs_required_for_rest_of_parts) {
5768 required_proc = num_free_procs -
5769 (minimum_num_procs_required_for_rest_of_parts);
5773 num_free_procs -= required_proc;
5777 --minimum_num_procs_required_for_rest_of_parts;
5780 num_procs_assigned_to_each_part[i] = required_proc;
5785 double imbalance_wrt_ideal =
5786 (scalar_required_proc - required_proc) / required_proc;
5787 if(imbalance_wrt_ideal > max_imbalance_difference) {
5788 max_imbalance_difference = imbalance_wrt_ideal;
5789 max_differing_part = i;
5795 if(num_free_procs > 0) {
5796 num_procs_assigned_to_each_part[max_differing_part] += num_free_procs;
5803 mj_part_t *part_assignment_proc_begin_indices =
new mj_part_t[num_parts];
5807 mj_part_t *processor_chains_in_parts =
new mj_part_t [num_procs];
5808 mj_part_t *processor_part_assignments =
new mj_part_t[num_procs];
5817 for(
int i = 0; i < num_procs; ++i ) {
5818 processor_part_assignments[i] = -1;
5819 processor_chains_in_parts[i] = -1;
5821 for(
int i = 0; i < num_parts; ++i ) {
5822 part_assignment_proc_begin_indices[i] = -1;
5828 uSignedSortItem<mj_part_t, mj_gno_t, char> *
5829 sort_item_num_part_points_in_procs =
5830 new uSignedSortItem<mj_part_t, mj_gno_t, char>[num_procs];
5832 for(mj_part_t i = 0; i < num_parts; ++i) {
5837 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5838 sort_item_num_part_points_in_procs[ii].id = ii;
5841 if(processor_part_assignments[ii] == -1) {
5842 sort_item_num_part_points_in_procs[ii].val =
5843 num_points_in_all_processor_parts[ii * num_parts + i];
5845 sort_item_num_part_points_in_procs[ii].signbit = 1;
5858 sort_item_num_part_points_in_procs[ii].val =
5859 num_points_in_all_processor_parts[ii * num_parts + i];
5860 sort_item_num_part_points_in_procs[ii].signbit = 0;
5865 uqSignsort<mj_part_t, mj_gno_t,char>
5866 (num_procs, sort_item_num_part_points_in_procs);
5878 mj_part_t required_proc_count = num_procs_assigned_to_each_part[i];
5879 mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
5880 mj_gno_t ideal_num_points_in_a_proc = Teuchos::as<mj_gno_t>(
5881 ceil(total_num_points_in_part /
double (required_proc_count)));
5884 mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
5885 mj_part_t next_proc_to_send_id =
5886 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
5887 mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc -
5888 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
5892 for(mj_part_t ii = num_procs - 1;
5893 ii >= num_procs - required_proc_count; --ii) {
5894 mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].id;
5896 processor_part_assignments[proc_id] = i;
5899 bool did_change_sign =
false;
5901 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5904 if(sort_item_num_part_points_in_procs[ii].signbit == 0) {
5905 did_change_sign =
true;
5906 sort_item_num_part_points_in_procs[ii].signbit = 1;
5913 if(did_change_sign) {
5916 uqSignsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count,
5917 sort_item_num_part_points_in_procs);
5932 if(!did_i_find_my_group) {
5933 for(mj_part_t ii = num_procs - 1; ii >=
5934 num_procs - required_proc_count; --ii) {
5936 mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].id;
5939 processor_ranks_for_subcomm.push_back(proc_id_to_assign);
5941 if(proc_id_to_assign == this->myRank) {
5943 did_i_find_my_group =
true;
5946 part_assignment_proc_begin_indices[i] = this->myRank;
5947 processor_chains_in_parts[this->myRank] = -1;
5951 send_count_to_each_proc[this->myRank] =
5952 sort_item_num_part_points_in_procs[ii].val;
5956 for(mj_part_t in = 0; in < i; ++in) {
5957 output_part_numbering_begin_index +=
5958 (*next_future_num_parts_in_parts)[in];
5966 if(!did_i_find_my_group) {
5967 processor_ranks_for_subcomm.clear();
5975 for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii) {
5976 mj_part_t nonassigned_proc_id =
5977 sort_item_num_part_points_in_procs[ii].id;
5978 mj_lno_t num_points_to_sent =
5979 sort_item_num_part_points_in_procs[ii].val;
5985 if(num_points_to_sent < 0) {
5986 cout <<
"Migration - processor assignments - for part:" << i
5987 <<
"from proc:" << nonassigned_proc_id <<
" num_points_to_sent:"
5988 << num_points_to_sent << std::endl;
5993 switch (migration_type) {
5997 while (num_points_to_sent > 0) {
5999 if(num_points_to_sent <= space_left_in_sent_proc) {
6001 space_left_in_sent_proc -= num_points_to_sent;
6003 if(this->myRank == nonassigned_proc_id) {
6005 send_count_to_each_proc[next_proc_to_send_id] =
6010 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6011 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6012 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6014 num_points_to_sent = 0;
6018 if(space_left_in_sent_proc > 0) {
6019 num_points_to_sent -= space_left_in_sent_proc;
6022 if(this->myRank == nonassigned_proc_id) {
6024 send_count_to_each_proc[next_proc_to_send_id] =
6025 space_left_in_sent_proc;
6026 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6027 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6028 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6032 ++next_proc_to_send_index;
6035 if(next_part_to_send_index < nprocs - required_proc_count ) {
6036 cout <<
"Migration - processor assignments - for part:"
6038 <<
" next_part_to_send :" << next_part_to_send_index
6039 <<
" nprocs:" << nprocs
6040 <<
" required_proc_count:" << required_proc_count
6041 <<
" Error: next_part_to_send_index <" <<
6042 <<
" nprocs - required_proc_count" << std::endl;
6047 next_proc_to_send_id =
6048 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
6050 space_left_in_sent_proc = ideal_num_points_in_a_proc -
6051 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
6062 if(this->myRank == nonassigned_proc_id) {
6064 send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
6068 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6069 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6070 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6072 num_points_to_sent = 0;
6073 ++next_proc_to_send_index;
6077 if(next_proc_to_send_index == num_procs) {
6078 next_proc_to_send_index = num_procs - required_proc_count;
6081 next_proc_to_send_id =
6082 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
6084 space_left_in_sent_proc = ideal_num_points_in_a_proc -
6085 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
6098 this->assign_send_destinations(
6100 part_assignment_proc_begin_indices,
6101 processor_chains_in_parts,
6102 send_count_to_each_proc,
6103 coordinate_destinations);
6104 delete [] part_assignment_proc_begin_indices;
6105 delete [] processor_chains_in_parts;
6106 delete [] processor_part_assignments;
6107 delete [] sort_item_num_part_points_in_procs;
6108 delete [] num_procs_assigned_to_each_part;
6126 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6127 typename mj_part_t,
typename mj_node_t>
6128 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6129 assign_send_destinations2(
6130 mj_part_t num_parts,
6131 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
6132 int *coordinate_destinations,
6133 mj_part_t &output_part_numbering_begin_index,
6134 std::vector<mj_part_t> *next_future_num_parts_in_parts)
6136 mj_part_t part_shift_amount = output_part_numbering_begin_index;
6137 mj_part_t previous_processor = -1;
6139 auto local_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
6140 Kokkos::deep_copy(local_new_part_xadj, this->new_part_xadj);
6142 auto local_new_coordinate_permutations =
6143 Kokkos::create_mirror_view(this->new_coordinate_permutations);
6144 Kokkos::deep_copy(local_new_coordinate_permutations,
6145 this->new_coordinate_permutations);
6147 for(mj_part_t i = 0; i < num_parts; ++i) {
6148 mj_part_t p = sort_item_part_to_proc_assignment[i].id;
6151 mj_lno_t part_begin_index = 0;
6154 part_begin_index = local_new_part_xadj(p - 1);
6157 mj_lno_t part_end_index = local_new_part_xadj(p);
6159 mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].val;
6160 if(this->myRank == assigned_proc && previous_processor != assigned_proc) {
6161 output_part_numbering_begin_index = part_shift_amount;
6163 previous_processor = assigned_proc;
6164 part_shift_amount += (*next_future_num_parts_in_parts)[p];
6166 for(mj_lno_t j= part_begin_index; j < part_end_index; j++) {
6167 mj_lno_t localInd = local_new_coordinate_permutations(j);
6168 coordinate_destinations[localInd] = assigned_proc;
6194 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6195 typename mj_part_t,
typename mj_node_t>
6196 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6197 mj_assign_parts_to_procs(
6198 mj_gno_t * num_points_in_all_processor_parts,
6199 mj_part_t num_parts,
6200 mj_part_t num_procs,
6201 mj_lno_t *send_count_to_each_proc,
6202 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6203 mj_part_t &out_num_part,
6204 std::vector<mj_part_t> &out_part_indices,
6205 mj_part_t &output_part_numbering_begin_index,
6206 int *coordinate_destinations) {
6209 mj_gno_t *global_num_points_in_parts =
6210 num_points_in_all_processor_parts + num_procs * num_parts;
6211 out_part_indices.clear();
6215 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment =
6216 new uSortItem<mj_part_t, mj_part_t>[num_parts];
6217 uSortItem<mj_part_t, mj_gno_t> * sort_item_num_points_of_proc_in_part_i =
6218 new uSortItem<mj_part_t, mj_gno_t>[num_procs];
6222 mj_lno_t work_each =
6223 mj_lno_t (this->num_global_coords / (
double (num_procs)) + 0.5f);
6227 mj_lno_t *space_in_each_processor =
new mj_lno_t[num_procs];
6230 for(mj_part_t i = 0; i < num_procs; ++i) {
6231 space_in_each_processor[i] = work_each;
6238 mj_part_t *num_parts_proc_assigned =
new mj_part_t[num_procs];
6239 memset(num_parts_proc_assigned, 0,
sizeof(mj_part_t) * num_procs);
6240 int empty_proc_count = num_procs;
6244 uSortItem<mj_part_t, mj_gno_t> * sort_item_point_counts_in_parts =
6245 new uSortItem<mj_part_t, mj_gno_t>[num_parts];
6250 for(mj_part_t i = 0; i < num_parts; ++i) {
6251 sort_item_point_counts_in_parts[i].id = i;
6252 sort_item_point_counts_in_parts[i].val = global_num_points_in_parts[i];
6256 uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);
6261 for(mj_part_t j = 0; j < num_parts; ++j) {
6263 mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].id;
6266 mj_gno_t load = global_num_points_in_parts[i];
6269 mj_part_t assigned_proc = -1;
6272 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
6273 sort_item_num_points_of_proc_in_part_i[ii].id = ii;
6278 if(empty_proc_count < num_parts - j ||
6279 num_parts_proc_assigned[ii] == 0) {
6281 sort_item_num_points_of_proc_in_part_i[ii].val =
6282 num_points_in_all_processor_parts[ii * num_parts + i];
6285 sort_item_num_points_of_proc_in_part_i[ii].val = -1;
6289 uqsort<mj_part_t, mj_gno_t>(num_procs,
6290 sort_item_num_points_of_proc_in_part_i);
6293 for(mj_part_t iii = num_procs - 1; iii >= 0; --iii) {
6294 mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
6295 if(assigned_proc == -1 ||
6296 (space_in_each_processor[ii] > space_in_each_processor[assigned_proc])) {
6299 else if(space_in_each_processor[ii] == space_in_each_processor[assigned_proc]) {
6300 if(ii < assigned_proc) {
6316 if(num_parts_proc_assigned[assigned_proc]++ == 0) {
6320 space_in_each_processor[assigned_proc] -= load;
6322 sort_item_part_to_proc_assignment[j].id = i;
6325 sort_item_part_to_proc_assignment[j].val = assigned_proc;
6328 if(assigned_proc == this->myRank) {
6330 out_part_indices.push_back(i);
6336 send_count_to_each_proc[assigned_proc] +=
6337 num_points_in_all_processor_parts[this->myRank * num_parts + i];
6340 delete [] num_parts_proc_assigned;
6341 delete [] sort_item_num_points_of_proc_in_part_i;
6342 delete [] sort_item_point_counts_in_parts;
6343 delete [] space_in_each_processor;
6346 uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);
6349 this->assign_send_destinations2(
6351 sort_item_part_to_proc_assignment,
6352 coordinate_destinations,
6353 output_part_numbering_begin_index,
6354 next_future_num_parts_in_parts);
6356 delete [] sort_item_part_to_proc_assignment;
6383 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6384 typename mj_part_t,
typename mj_node_t>
6385 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6386 mj_migration_part_proc_assignment(
6387 mj_gno_t * num_points_in_all_processor_parts,
6388 mj_part_t num_parts,
6389 mj_part_t num_procs,
6390 mj_lno_t *send_count_to_each_proc,
6391 std::vector<mj_part_t> &processor_ranks_for_subcomm,
6392 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6393 mj_part_t &out_num_part,
6394 std::vector<mj_part_t> &out_part_indices,
6395 mj_part_t &output_part_numbering_begin_index,
6396 int *coordinate_destinations)
6398 processor_ranks_for_subcomm.clear();
6400 if(num_procs > num_parts) {
6405 mj_part_t out_part_index = 0;
6407 this->mj_assign_proc_to_parts(
6408 num_points_in_all_processor_parts,
6411 send_count_to_each_proc,
6412 processor_ranks_for_subcomm,
6413 next_future_num_parts_in_parts,
6415 output_part_numbering_begin_index,
6416 coordinate_destinations
6420 out_part_indices.clear();
6421 out_part_indices.push_back(out_part_index);
6428 processor_ranks_for_subcomm.push_back(this->myRank);
6433 this->mj_assign_parts_to_procs(
6434 num_points_in_all_processor_parts,
6437 send_count_to_each_proc,
6438 next_future_num_parts_in_parts,
6441 output_part_numbering_begin_index,
6442 coordinate_destinations);
6459 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6460 typename mj_part_t,
typename mj_node_t>
6461 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6463 mj_part_t num_procs,
6464 mj_lno_t &num_new_local_points,
6465 std::string iteration,
6466 int *coordinate_destinations,
6467 mj_part_t num_parts)
6470 #ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
6471 if(
sizeof(mj_lno_t) <=
sizeof(
int)) {
6475 ZOLTAN_COMM_OBJ *plan = NULL;
6476 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->comm));
6477 int num_incoming_gnos = 0;
6478 int message_tag = 7859;
6481 mj_timer_base_string +
"Migration Z1PlanCreating-" + iteration);
6482 int ierr = Zoltan_Comm_Create(
6484 int(this->num_local_coords),
6485 coordinate_destinations,
6488 &num_incoming_gnos);
6492 mj_timer_base_string +
"Migration Z1PlanCreating-" + iteration);
6495 mj_timer_base_string +
"Migration Z1Migration-" + iteration);
6503 auto host_current_mj_gnos = Kokkos::create_mirror_view(
6504 Kokkos::HostSpace(), this->current_mj_gnos);
6505 Kokkos::deep_copy(host_current_mj_gnos, this->current_mj_gnos);
6506 Kokkos::View<mj_gno_t*, device_t> dst_gnos(
6507 Kokkos::ViewAllocateWithoutInitializing(
"dst_gnos"), num_incoming_gnos);
6508 auto host_dst_gnos = Kokkos::create_mirror_view(
6509 Kokkos::HostSpace(), dst_gnos);
6511 ierr = Zoltan_Comm_Do(
6514 (
char *) host_current_mj_gnos.data(),
6516 (
char *) host_dst_gnos.data());
6518 Kokkos::deep_copy(dst_gnos, host_dst_gnos);
6519 this->current_mj_gnos = dst_gnos;
6525 auto host_src_coordinates = Kokkos::create_mirror_view(
6526 Kokkos::HostSpace(), this->mj_coordinates);
6527 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6528 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6529 dst_coordinates(Kokkos::ViewAllocateWithoutInitializing(
"mj_coordinates"),
6530 num_incoming_gnos, this->coord_dim);
6531 auto host_dst_coordinates = Kokkos::create_mirror_view(
6532 Kokkos::HostSpace(), dst_coordinates);
6533 for(
int i = 0; i < this->coord_dim; ++i) {
6534 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sub_host_src_coordinates
6535 = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6536 Kokkos::View<mj_scalar_t *, Kokkos::HostSpace> sub_host_dst_coordinates
6537 = Kokkos::subview(host_dst_coordinates, Kokkos::ALL, i);
6540 ierr = Zoltan_Comm_Do(
6543 (
char *) sub_host_src_coordinates.data(),
6544 sizeof(mj_scalar_t),
6545 (
char *) sub_host_dst_coordinates.data());
6548 deep_copy(dst_coordinates, host_dst_coordinates);
6549 this->mj_coordinates = dst_coordinates;
6554 auto host_src_weights = Kokkos::create_mirror_view(
6555 Kokkos::HostSpace(), this->mj_weights);
6556 Kokkos::deep_copy(host_src_weights, this->mj_weights);
6557 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6558 Kokkos::ViewAllocateWithoutInitializing(
"mj_weights"),
6559 num_incoming_gnos, this->num_weights_per_coord);
6560 auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
6561 for(
int i = 0; i < this->num_weights_per_coord; ++i) {
6562 auto sub_host_src_weights
6563 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6564 auto sub_host_dst_weights
6565 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6566 ArrayRCP<mj_scalar_t> sent_weight(this->num_local_coords);
6568 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6569 sent_weight[n] = sub_host_src_weights(n);
6571 ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
6573 ierr = Zoltan_Comm_Do(
6576 (
char *) sent_weight.getRawPtr(),
6577 sizeof(mj_scalar_t),
6578 (
char *) received_weight.getRawPtr());
6581 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6582 sub_host_dst_weights(n) = received_weight[n];
6585 deep_copy(dst_weights, host_dst_weights);
6586 this->mj_weights = dst_weights;
6592 Kokkos::View<int *, Kokkos::HostSpace> dst_owners_of_coordinate(
6593 Kokkos::ViewAllocateWithoutInitializing(
"owner_of_coordinate"),
6596 ierr = Zoltan_Comm_Do(
6599 (
char *) owner_of_coordinate.data(),
6601 (
char *) dst_owners_of_coordinate.data());
6603 this->owner_of_coordinate = dst_owners_of_coordinate;
6610 auto host_src_assigned_part_ids = Kokkos::create_mirror_view(
6611 Kokkos::HostSpace(), this->assigned_part_ids);
6612 Kokkos::deep_copy(host_src_assigned_part_ids, this->assigned_part_ids);
6613 Kokkos::View<int *, device_t> dst_assigned_part_ids(
6614 Kokkos::ViewAllocateWithoutInitializing(
"assigned_part_ids"),
6616 auto host_dst_assigned_part_ids = Kokkos::create_mirror_view(
6617 Kokkos::HostSpace(), dst_assigned_part_ids);
6618 mj_part_t *new_parts =
new mj_part_t[num_incoming_gnos];
6619 if(num_procs < num_parts) {
6621 ierr = Zoltan_Comm_Do(
6624 (
char *) host_src_assigned_part_ids.data(),
6626 (
char *) host_dst_assigned_part_ids.data());
6628 Kokkos::deep_copy(dst_assigned_part_ids, host_dst_assigned_part_ids);
6632 this->assigned_part_ids = dst_assigned_part_ids;
6635 ierr = Zoltan_Comm_Destroy(&plan);
6637 num_new_local_points = num_incoming_gnos;
6639 mj_timer_base_string +
"Migration Z1Migration-" + iteration);
6642 #endif // ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
6644 this->mj_env->timerStart(
MACRO_TIMERS, mj_timer_base_string +
6645 "Migration DistributorPlanCreating-" + iteration);
6647 Tpetra::Distributor distributor(this->comm);
6648 ArrayView<const mj_part_t> destinations( coordinate_destinations,
6649 this->num_local_coords);
6650 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
6651 this->mj_env->timerStop(
MACRO_TIMERS, mj_timer_base_string +
6652 "Migration DistributorPlanCreating-" + iteration);
6653 this->mj_env->timerStart(
MACRO_TIMERS, mj_timer_base_string +
6654 "Migration DistributorMigration-" + iteration);
6662 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
6663 Kokkos::ViewAllocateWithoutInitializing(
"received_gnos"),
6666 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
6667 Kokkos::ViewAllocateWithoutInitializing(
"sent_gnos"),
6668 this->current_mj_gnos.extent(0));
6669 Kokkos::deep_copy(sent_gnos, this->current_mj_gnos);
6671 distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
6673 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
6674 Kokkos::ViewAllocateWithoutInitializing(
"gids"), num_incoming_gnos);
6676 Kokkos::deep_copy(this->current_mj_gnos, received_gnos);
6681 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6682 dst_coordinates(
"mj_coordinates", num_incoming_gnos, this->coord_dim);
6684 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>
6685 host_src_coordinates(
6686 Kokkos::ViewAllocateWithoutInitializing(
"host_coords"),
6687 this->mj_coordinates.extent(0), this->mj_coordinates.extent(1));
6688 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6690 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_coord(
6691 Kokkos::ViewAllocateWithoutInitializing(
"received_coord"),
6694 for(
int i = 0; i < this->coord_dim; ++i) {
6698 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_coord
6699 = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6701 distributor.doPostsAndWaits(sent_coord, 1, received_coord);
6703 Kokkos::deep_copy(Kokkos::subview(dst_coordinates, Kokkos::ALL, i),
6709 this->mj_coordinates = dst_coordinates;
6712 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6713 "mj_weights", num_incoming_gnos, this->num_weights_per_coord);
6714 auto host_dst_weights = Kokkos::create_mirror_view(Kokkos::HostSpace(),
6717 auto host_src_weights = Kokkos::create_mirror_view_and_copy(
6718 Kokkos::HostSpace(), this->mj_weights);
6721 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_weight(
6722 Kokkos::ViewAllocateWithoutInitializing(
"send_weight_buffer"),
6723 this->num_local_coords);
6725 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_weight(
6726 Kokkos::ViewAllocateWithoutInitializing(
"received_weight_buffer"),
6729 for(
int i = 0; i < this->num_weights_per_coord; ++i) {
6731 auto sub_host_src_weights
6732 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6734 auto sub_host_dst_weights
6735 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6742 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6743 sent_weight[n] = sub_host_src_weights(n);
6746 distributor.doPostsAndWaits(sent_weight, 1, received_weight);
6749 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6750 sub_host_dst_weights(n) = received_weight[n];
6753 Kokkos::deep_copy(dst_weights, host_dst_weights);
6754 this->mj_weights = dst_weights;
6759 Kokkos::View<int *, Kokkos::HostSpace> received_owners(
6760 Kokkos::ViewAllocateWithoutInitializing(
"owner_of_coordinate"),
6763 distributor.doPostsAndWaits(owner_of_coordinate, 1, received_owners);
6765 this->owner_of_coordinate = received_owners;
6771 if(num_procs < num_parts) {
6772 Kokkos::View<mj_part_t*, Kokkos::HostSpace> sent_partids(
6773 Kokkos::ViewAllocateWithoutInitializing(
"host_parts"),
6774 this->assigned_part_ids.extent(0));
6775 Kokkos::deep_copy(sent_partids, assigned_part_ids);
6777 Kokkos::View<mj_part_t*, Kokkos::HostSpace> received_partids(
6778 Kokkos::ViewAllocateWithoutInitializing(
"received_partids"),
6781 distributor.doPostsAndWaits(sent_partids, 1, received_partids);
6783 this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6784 (
"assigned_part_ids", num_incoming_gnos);
6785 Kokkos::deep_copy(this->assigned_part_ids, received_partids);
6788 this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6789 (
"assigned_part_ids", num_incoming_gnos);
6791 this->mj_env->timerStop(
MACRO_TIMERS,
"" + mj_timer_base_string +
6792 "Migration DistributorMigration-" + iteration);
6794 num_new_local_points = num_incoming_gnos;
6803 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6804 typename mj_part_t,
typename mj_node_t>
6805 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6806 create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm)
6808 mj_part_t group_size = processor_ranks_for_subcomm.size();
6809 mj_part_t *ids =
new mj_part_t[group_size];
6810 for(mj_part_t i = 0; i < group_size; ++i) {
6811 ids[i] = processor_ranks_for_subcomm[i];
6813 ArrayView<const mj_part_t> idView(ids, group_size);
6814 this->comm = this->comm->createSubcommunicator(idView);
6823 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6824 typename mj_part_t,
typename mj_node_t>
6825 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6826 fill_permutation_array(
6827 mj_part_t output_num_parts,
6828 mj_part_t num_parts)
6831 if(output_num_parts == 1) {
6832 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
6833 Kokkos::parallel_for(
6834 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
6835 (0, this->num_local_coords),
6836 KOKKOS_LAMBDA(mj_lno_t i) {
6837 local_new_coordinate_permutations(i) = i;
6839 auto local_new_part_xadj = this->new_part_xadj;
6840 auto local_num_local_coords = this->num_local_coords;
6841 Kokkos::parallel_for(
6842 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
6843 KOKKOS_LAMBDA(
int dummy) {
6844 local_new_part_xadj(0) = local_num_local_coords;
6848 auto local_num_local_coords = this->num_local_coords;
6849 auto local_assigned_part_ids = this->assigned_part_ids;
6850 auto local_new_part_xadj = this->new_part_xadj;
6851 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
6854 Kokkos::View<mj_part_t*, device_t> part_shifts(
"part_shifts", num_parts);
6859 Kokkos::View<mj_lno_t*, device_t> num_points_in_parts(
6860 "num_points_in_parts", num_parts);
6862 Kokkos::parallel_for(
6863 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
6864 KOKKOS_LAMBDA(
int dummy) {
6866 for(mj_lno_t i = 0; i < local_num_local_coords; ++i) {
6867 mj_part_t ii = local_assigned_part_ids(i);
6868 ++num_points_in_parts(ii);
6873 mj_lno_t prev_index = 0;
6874 for(mj_part_t i = 0; i < num_parts; ++i) {
6875 if(num_points_in_parts(i) > 0) {
6876 local_new_part_xadj(p) = prev_index + num_points_in_parts(i);
6877 prev_index += num_points_in_parts(i);
6878 part_shifts(i) = p++;
6883 mj_part_t assigned_num_parts = p - 1;
6884 for(;p < num_parts; ++p) {
6885 local_new_part_xadj(p) =
6886 local_new_part_xadj(assigned_num_parts);
6888 for(mj_part_t i = 0; i < output_num_parts; ++i) {
6889 num_points_in_parts(i) = local_new_part_xadj(i);
6895 for(mj_lno_t i = local_num_local_coords - 1; i >= 0; --i) {
6897 part_shifts[mj_part_t(local_assigned_part_ids(i))];
6898 local_new_coordinate_permutations(--num_points_in_parts[part]) = i;
6928 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6929 typename mj_part_t,
typename mj_node_t>
6930 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6931 mj_perform_migration(
6932 mj_part_t input_num_parts,
6933 mj_part_t &output_num_parts,
6934 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6935 mj_part_t &output_part_begin_index,
6936 size_t migration_reduce_all_population,
6937 mj_lno_t num_coords_for_last_dim_part,
6938 std::string iteration,
6939 RCP<mj_partBoxVector_t> &input_part_boxes,
6940 RCP<mj_partBoxVector_t> &output_part_boxes)
6942 mj_part_t num_procs = this->comm->getSize();
6943 this->myRank = this->comm->getRank();
6948 mj_gno_t *num_points_in_all_processor_parts =
6949 new mj_gno_t[input_num_parts * (num_procs + 1)];
6952 this->get_processor_num_points_in_parts(
6955 num_points_in_all_processor_parts);
6958 if(!this->mj_check_to_migrate(
6959 migration_reduce_all_population,
6960 num_coords_for_last_dim_part,
6963 num_points_in_all_processor_parts)) {
6964 delete [] num_points_in_all_processor_parts;
6968 mj_lno_t *send_count_to_each_proc = NULL;
6969 int *coordinate_destinations =
new int[this->num_local_coords];
6970 send_count_to_each_proc =
new mj_lno_t[num_procs];
6972 for(
int i = 0; i < num_procs; ++i) {
6973 send_count_to_each_proc[i] = 0;
6976 std::vector<mj_part_t> processor_ranks_for_subcomm;
6977 std::vector<mj_part_t> out_part_indices;
6980 this->mj_migration_part_proc_assignment(
6981 num_points_in_all_processor_parts,
6984 send_count_to_each_proc,
6985 processor_ranks_for_subcomm,
6986 next_future_num_parts_in_parts,
6989 output_part_begin_index,
6990 coordinate_destinations);
6992 delete [] send_count_to_each_proc;
6993 std::vector <mj_part_t> tmpv;
6995 std::sort (out_part_indices.begin(), out_part_indices.end());
6996 mj_part_t outP = out_part_indices.size();
6997 mj_gno_t new_global_num_points = 0;
6998 mj_gno_t *global_num_points_in_parts =
6999 num_points_in_all_processor_parts + num_procs * input_num_parts;
7001 if(this->mj_keep_part_boxes) {
7002 input_part_boxes->clear();
7007 for(mj_part_t i = 0; i < outP; ++i) {
7008 mj_part_t ind = out_part_indices[i];
7009 new_global_num_points += global_num_points_in_parts[ind];
7010 tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
7011 if(this->mj_keep_part_boxes) {
7012 input_part_boxes->push_back((*output_part_boxes)[ind]);
7017 if(this->mj_keep_part_boxes) {
7018 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7019 input_part_boxes = output_part_boxes;
7020 output_part_boxes = tmpPartBoxes;
7022 next_future_num_parts_in_parts->clear();
7023 for(mj_part_t i = 0; i < outP; ++i) {
7024 mj_part_t p = tmpv[i];
7025 next_future_num_parts_in_parts->push_back(p);
7028 delete [] num_points_in_all_processor_parts;
7030 mj_lno_t num_new_local_points = 0;
7032 this->mj_migrate_coords(
7034 num_new_local_points,
7036 coordinate_destinations,
7039 delete [] coordinate_destinations;
7040 if(this->num_local_coords != num_new_local_points) {
7041 this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
7042 (Kokkos::ViewAllocateWithoutInitializing(
"new_coordinate_permutations"),
7043 num_new_local_points);
7044 this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
7045 (Kokkos::ViewAllocateWithoutInitializing(
"coordinate_permutations"),
7046 num_new_local_points);
7048 this->num_local_coords = num_new_local_points;
7049 this->num_global_coords = new_global_num_points;
7052 this->create_sub_communicator(processor_ranks_for_subcomm);
7054 processor_ranks_for_subcomm.clear();
7057 this->fill_permutation_array(output_num_parts, input_num_parts);
7080 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7081 typename mj_part_t,
typename mj_node_t>
7082 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7083 create_consistent_chunks(
7084 mj_part_t num_parts,
7085 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
7086 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
7087 mj_lno_t coordinate_begin,
7088 mj_lno_t coordinate_end,
7089 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
7090 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
7092 bool longest_dim_part,
7093 uSignedSortItem<int, mj_scalar_t, char> * p_coord_dimension_range_sorted)
7103 mj_part_t no_cuts = num_parts - 1;
7107 if(this->distribute_points_on_cut_lines) {
7108 auto local_thread_cut_line_weight_to_put_left =
7109 this->thread_cut_line_weight_to_put_left;
7110 auto local_thread_part_weight_work =
7111 this->thread_part_weight_work;
7112 auto local_sEpsilon = this->
sEpsilon;
7114 Kokkos::parallel_for(
7115 Kokkos::RangePolicy<
typename mj_node_t::execution_space,
7116 mj_part_t> (0, no_cuts), KOKKOS_LAMBDA (mj_part_t i) {
7118 mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
7119 if(left_weight > local_sEpsilon) {
7121 mj_scalar_t thread_ii_weight_on_cut =
7122 local_thread_part_weight_work(i * 2 + 1) -
7123 local_thread_part_weight_work(i * 2);
7124 if(thread_ii_weight_on_cut < left_weight) {
7125 local_thread_cut_line_weight_to_put_left(i) =
7126 thread_ii_weight_on_cut;
7129 local_thread_cut_line_weight_to_put_left(i) = left_weight;
7133 local_thread_cut_line_weight_to_put_left(i) = 0;
7138 auto local_least_signifiance = least_signifiance;
7139 auto local_significance_mul = significance_mul;
7140 Kokkos::parallel_for(
7141 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
7142 (0, 1), KOKKOS_LAMBDA (
int dummy) {
7146 for(mj_part_t i = no_cuts - 1; i > 0 ; --i) {
7147 mj_scalar_t cut1 = current_concurrent_cut_coordinate(i-1);
7148 mj_scalar_t cut2 = current_concurrent_cut_coordinate(i);
7149 mj_scalar_t delta = cut2 - cut1;
7150 mj_scalar_t abs_delta = (delta > 0) ? delta : -delta;
7151 if(abs_delta < local_sEpsilon) {
7152 local_thread_cut_line_weight_to_put_left(i) -=
7153 local_thread_cut_line_weight_to_put_left(i - 1);
7155 local_thread_cut_line_weight_to_put_left(i) =
7156 static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
7157 local_least_signifiance) * local_significance_mul) /
7158 static_cast<mj_scalar_t
>(local_significance_mul);
7164 auto local_thread_point_counts = this->thread_point_counts;
7165 Kokkos::parallel_for(
7166 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
7167 (0, num_parts), KOKKOS_LAMBDA (mj_part_t i) {
7168 local_thread_point_counts(i) = 0;
7179 mj_part_t *cut_map =
new mj_part_t[no_cuts];
7181 typedef uMultiSortItem<mj_lno_t, int, mj_scalar_t> multiSItem;
7182 typedef std::vector< multiSItem > multiSVector;
7183 typedef std::vector<multiSVector> multiS2Vector;
7186 std::vector<mj_scalar_t *>allocated_memory;
7189 multiS2Vector sort_vector_points_on_cut;
7192 mj_part_t different_cut_count = 1;
7198 multiSVector tmpMultiSVector;
7199 sort_vector_points_on_cut.push_back(tmpMultiSVector);
7201 auto local_current_concurrent_cut_coordinate =
7202 current_concurrent_cut_coordinate;
7203 auto host_current_concurrent_cut_coordinate =
7204 Kokkos::create_mirror_view(local_current_concurrent_cut_coordinate);
7205 Kokkos::deep_copy(host_current_concurrent_cut_coordinate,
7206 local_current_concurrent_cut_coordinate);
7208 for(mj_part_t i = 1; i < no_cuts ; ++i) {
7211 if(std::abs(host_current_concurrent_cut_coordinate(i) -
7212 host_current_concurrent_cut_coordinate(i-1)) < this->sEpsilon) {
7213 cut_map[i] = cut_map[i-1];
7216 cut_map[i] = different_cut_count++;
7217 multiSVector tmp2MultiSVector;
7218 sort_vector_points_on_cut.push_back(tmp2MultiSVector);
7221 Kokkos::deep_copy(current_concurrent_cut_coordinate,
7222 host_current_concurrent_cut_coordinate);
7225 auto host_coordinate_permutations =
7226 Kokkos::create_mirror_view(coordinate_permutations);
7227 Kokkos::deep_copy(host_coordinate_permutations, coordinate_permutations);
7229 auto host_assigned_part_ids = Kokkos::create_mirror_view(assigned_part_ids);
7230 Kokkos::deep_copy(host_assigned_part_ids, assigned_part_ids);
7232 auto host_mj_coordinates = Kokkos::create_mirror_view(mj_coordinates);
7233 Kokkos::deep_copy(host_mj_coordinates, mj_coordinates);
7235 auto host_thread_point_counts = Kokkos::create_mirror_view(thread_point_counts);
7236 Kokkos::deep_copy(host_thread_point_counts, thread_point_counts);
7238 auto local_coord_dim = this->coord_dim;
7240 for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7241 mj_lno_t i = host_coordinate_permutations(ii);
7242 mj_part_t pp = host_assigned_part_ids(i);
7243 mj_part_t p = pp / 2;
7246 mj_scalar_t *vals =
new mj_scalar_t[local_coord_dim -1];
7247 allocated_memory.push_back(vals);
7252 if(longest_dim_part) {
7254 for(
int dim = local_coord_dim - 2; dim >= 0; --dim) {
7257 int next_largest_coord_dim = p_coord_dimension_range_sorted[dim].id;
7262 host_mj_coordinates(i,next_largest_coord_dim);
7266 for(
int dim = coordInd + 1; dim < local_coord_dim; ++dim) {
7267 vals[val_ind++] = host_mj_coordinates(i,dim);
7269 for(
int dim = 0; dim < coordInd; ++dim) {
7270 vals[val_ind++] = host_mj_coordinates(i,dim);
7274 multiSItem tempSortItem(i, local_coord_dim -1, vals);
7276 mj_part_t cmap = cut_map[p];
7277 sort_vector_points_on_cut[cmap].push_back(tempSortItem);
7281 ++host_thread_point_counts(p);
7282 host_assigned_part_ids(i) = p;
7287 for(mj_part_t i = 0; i < different_cut_count; ++i) {
7288 std::sort (sort_vector_points_on_cut[i].begin(),
7289 sort_vector_points_on_cut[i].end());
7292 mj_part_t previous_cut_map = cut_map[0];
7294 auto host_thread_cut_line_weight_to_put_left =
7295 Kokkos::create_mirror_view(thread_cut_line_weight_to_put_left);
7296 Kokkos::deep_copy(host_thread_cut_line_weight_to_put_left,
7297 thread_cut_line_weight_to_put_left);
7299 auto host_mj_weights = Kokkos::create_mirror_view(mj_weights);
7300 Kokkos::deep_copy(host_mj_weights, mj_weights);
7310 mj_scalar_t weight_stolen_from_previous_part = 0;
7311 for(mj_part_t p = 0; p < no_cuts; ++p) {
7312 mj_part_t mapped_cut = cut_map[p];
7316 if(previous_cut_map != mapped_cut) {
7317 mj_lno_t sort_vector_end = (mj_lno_t)
7318 sort_vector_points_on_cut[previous_cut_map].size() - 1;
7319 for(; sort_vector_end >= 0; --sort_vector_end) {
7321 sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7322 mj_lno_t i = t.index;
7323 ++host_thread_point_counts(p);
7324 host_assigned_part_ids(i) = p;
7326 sort_vector_points_on_cut[previous_cut_map].clear();
7330 mj_lno_t sort_vector_end = (mj_lno_t)
7331 sort_vector_points_on_cut[mapped_cut].size() - 1;
7337 for(; sort_vector_end >= 0; --sort_vector_end) {
7340 multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
7342 mj_lno_t i = t.index;
7343 mj_scalar_t w = this->mj_uniform_weights(0) ? 1 :
7344 this->mj_weights(i,0);
7346 if(host_thread_cut_line_weight_to_put_left(p) +
7347 weight_stolen_from_previous_part> this->sEpsilon &&
7348 host_thread_cut_line_weight_to_put_left(p) +
7349 weight_stolen_from_previous_part -
7350 std::abs(host_thread_cut_line_weight_to_put_left(p) +
7351 weight_stolen_from_previous_part - w)> this->sEpsilon)
7353 host_thread_cut_line_weight_to_put_left(p) -= w;
7355 sort_vector_points_on_cut[mapped_cut].pop_back();
7357 ++host_thread_point_counts(p);
7358 host_assigned_part_ids(i) = p;
7362 if(p < no_cuts - 1 &&
7363 host_thread_cut_line_weight_to_put_left(p) < this->sEpsilon) {
7364 if(mapped_cut == cut_map[p + 1] ) {
7367 if(previous_cut_map != mapped_cut) {
7368 weight_stolen_from_previous_part =
7369 host_thread_cut_line_weight_to_put_left(p);
7374 weight_stolen_from_previous_part +=
7375 host_thread_cut_line_weight_to_put_left(p);
7379 weight_stolen_from_previous_part =
7380 -host_thread_cut_line_weight_to_put_left(p);
7389 if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]) {
7390 if(previous_cut_map != mapped_cut) {
7391 weight_stolen_from_previous_part =
7392 host_thread_cut_line_weight_to_put_left(p);
7395 weight_stolen_from_previous_part +=
7396 host_thread_cut_line_weight_to_put_left(p);
7400 weight_stolen_from_previous_part =
7401 -host_thread_cut_line_weight_to_put_left(p);
7407 previous_cut_map = mapped_cut;
7412 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[
7413 previous_cut_map].size() - 1;
7419 for(; sort_vector_end >= 0; --sort_vector_end) {
7421 multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7424 mj_lno_t i = t.index;
7425 ++host_thread_point_counts(no_cuts);
7426 host_assigned_part_ids(i) = no_cuts;
7429 sort_vector_points_on_cut[previous_cut_map].clear();
7433 mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
7434 for(mj_lno_t i = 0; i < vSize; ++i) {
7435 delete [] allocated_memory[i];
7438 auto local_out_part_xadj = out_part_xadj;
7439 auto host_out_part_xadj = Kokkos::create_mirror_view(local_out_part_xadj);
7440 Kokkos::deep_copy(host_out_part_xadj, out_part_xadj);
7443 for(mj_part_t j = 0; j < num_parts; ++j) {
7444 host_out_part_xadj(j) = host_thread_point_counts(j);
7445 host_thread_point_counts(j) = 0;
7449 for(mj_part_t j = 1; j < num_parts; ++j) {
7450 host_out_part_xadj(j) += host_out_part_xadj(j - 1);
7455 for(mj_part_t j = 1; j < num_parts; ++j) {
7456 host_thread_point_counts(j) += host_out_part_xadj(j - 1);
7459 auto host_new_coordinate_permutations =
7460 Kokkos::create_mirror_view(new_coordinate_permutations);
7461 Kokkos::deep_copy(host_new_coordinate_permutations,
7462 new_coordinate_permutations);
7466 for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7467 mj_lno_t i = host_coordinate_permutations(ii);
7468 mj_part_t p = host_assigned_part_ids(i);
7469 host_new_coordinate_permutations(coordinate_begin +
7470 host_thread_point_counts(p)++) = i;
7473 Kokkos::deep_copy(thread_point_counts, host_thread_point_counts);
7474 Kokkos::deep_copy(new_coordinate_permutations,
7475 host_new_coordinate_permutations);
7476 Kokkos::deep_copy(local_out_part_xadj, host_out_part_xadj);
7488 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7489 typename mj_part_t,
typename mj_node_t>
7490 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7492 mj_part_t current_num_parts,
7493 mj_part_t output_part_begin_index,
7494 RCP<mj_partBoxVector_t> &output_part_boxes,
7495 bool is_data_ever_migrated)
7498 mj_timer_base_string +
"Part_Assignment");
7501 auto local_mj_keep_part_boxes = mj_keep_part_boxes;
7502 auto local_coordinate_permutations = coordinate_permutations;
7503 auto local_assigned_part_ids = assigned_part_ids;
7505 if(local_mj_keep_part_boxes) {
7506 for(
int i = 0; i < current_num_parts; ++i) {
7507 (*output_part_boxes)[i].setpId(i + output_part_begin_index);
7511 Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy(
7512 current_num_parts, Kokkos::AUTO());
7513 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>
::
7515 Kokkos::parallel_for(policy, KOKKOS_LAMBDA(member_type team_member) {
7516 int i = team_member.league_rank();
7517 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, (i != 0) ?
7518 local_part_xadj(i-1) : 0, local_part_xadj(i)),
7520 mj_lno_t k = local_coordinate_permutations(ii);
7521 local_assigned_part_ids(k) = i + output_part_begin_index;
7525 if(is_data_ever_migrated) {
7526 #ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
7527 if(
sizeof(mj_lno_t) <=
sizeof(int)) {
7534 ZOLTAN_COMM_OBJ *plan = NULL;
7535 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->mj_problemComm));
7538 int message_tag = 7856;
7541 mj_timer_base_string +
"Final Z1PlanCreating");
7544 int ierr = Zoltan_Comm_Create( &plan,
int(this->num_local_coords),
7545 this->owner_of_coordinate.data(), mpi_comm, message_tag, &incoming);
7549 mj_timer_base_string +
"Final Z1PlanCreating" );
7552 mj_timer_base_string +
"Final Z1PlanComm");
7559 auto host_current_mj_gnos = Kokkos::create_mirror_view(
7560 Kokkos::HostSpace(), this->current_mj_gnos);
7561 deep_copy(host_current_mj_gnos, this->current_mj_gnos);
7562 Kokkos::View<mj_gno_t*, device_t> dst_gnos(
7563 Kokkos::ViewAllocateWithoutInitializing(
"dst_gnos"), incoming);
7564 auto host_dst_gnos = Kokkos::create_mirror_view(
7565 Kokkos::HostSpace(), dst_gnos);
7567 ierr = Zoltan_Comm_Do( plan, message_tag,
7568 (
char *) host_current_mj_gnos.data(),
7569 sizeof(mj_gno_t), (
char *) host_dst_gnos.data());
7571 Kokkos::deep_copy(dst_gnos, host_dst_gnos);
7572 this->current_mj_gnos = dst_gnos;
7575 auto host_src_part_ids = Kokkos::create_mirror_view(
7576 Kokkos::HostSpace(), this->assigned_part_ids);
7577 deep_copy(host_src_part_ids, this->assigned_part_ids);
7578 Kokkos::View<mj_part_t*, device_t> dst_part_ids(
7579 Kokkos::ViewAllocateWithoutInitializing(
"dst_part_ids"), incoming);
7580 auto host_dst_part_ids = Kokkos::create_mirror_view(
7581 Kokkos::HostSpace(), dst_part_ids);
7583 ierr = Zoltan_Comm_Do( plan, message_tag,
7584 (
char *) host_src_part_ids.data(),
7585 sizeof(mj_part_t), (
char *) host_dst_part_ids.data());
7587 Kokkos::deep_copy(dst_part_ids, host_dst_part_ids);
7588 this->assigned_part_ids = dst_part_ids;
7590 ierr = Zoltan_Comm_Destroy(&plan);
7593 this->num_local_coords = incoming;
7596 mj_timer_base_string +
"Final Z1PlanComm");
7599 #endif // ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
7603 mj_timer_base_string +
"Final DistributorPlanCreating");
7604 Tpetra::Distributor distributor(this->mj_problemComm);
7605 ArrayView<const mj_part_t> owners_of_coords(
7606 this->owner_of_coordinate.data(), this->num_local_coords);
7607 mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
7609 mj_timer_base_string +
"Final DistributorPlanCreating" );
7612 mj_timer_base_string +
"Final DistributorPlanComm");
7618 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
7619 Kokkos::ViewAllocateWithoutInitializing(
"sent_gnos"),
7620 this->current_mj_gnos.extent(0));
7621 Kokkos::deep_copy(sent_gnos, this->current_mj_gnos);
7623 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
7624 Kokkos::ViewAllocateWithoutInitializing(
"received_gnos"),
7627 distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
7629 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
7630 Kokkos::ViewAllocateWithoutInitializing(
"current_mj_gnos"), incoming);
7632 Kokkos::deep_copy(this->current_mj_gnos, received_gnos);
7635 Kokkos::View<mj_part_t *, Kokkos::HostSpace> sent_partids(
7636 Kokkos::ViewAllocateWithoutInitializing(
"sent_partids"),
7637 this->assigned_part_ids.extent(0));
7638 Kokkos::deep_copy(sent_partids, this->assigned_part_ids);
7640 Kokkos::View<mj_part_t *, Kokkos::HostSpace> received_partids(
7641 Kokkos::ViewAllocateWithoutInitializing(
"received_partids"),
7644 distributor.doPostsAndWaits(sent_partids, 1, received_partids);
7646 this->assigned_part_ids =
7647 Kokkos::View<mj_part_t*, device_t>(
7648 Kokkos::ViewAllocateWithoutInitializing(
"assigned_part_ids"),
7651 Kokkos::deep_copy(this->assigned_part_ids, received_partids);
7652 this->num_local_coords = incoming;
7655 mj_timer_base_string +
"Final DistributorPlanComm");
7660 mj_timer_base_string +
"Part_Assignment");
7663 mj_timer_base_string +
"Solution_Part_Assignment");
7668 if(this->mj_keep_part_boxes) {
7669 this->kept_boxes = compute_global_box_boundaries(output_part_boxes);
7673 mj_timer_base_string +
"Solution_Part_Assignment");
7688 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7689 typename mj_part_t,
typename mj_node_t>
7692 bool distribute_points_on_cut_lines_,
7693 int max_concurrent_part_calculation_,
7694 int check_migrate_avoid_migration_option_,
7695 double minimum_migration_imbalance_,
7696 int migration_type_)
7698 this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
7699 this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
7700 this->check_migrate_avoid_migration_option =
7701 check_migrate_avoid_migration_option_;
7702 this->minimum_migration_imbalance = minimum_migration_imbalance_;
7703 this->migration_type = migration_type_;
7733 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7734 typename mj_part_t,
typename mj_node_t>
7737 const RCP<const Environment> &env,
7738 RCP<
const Comm<int> > &problemComm,
7739 double imbalance_tolerance_,
7741 size_t num_global_parts_,
7742 Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array_,
7743 int recursion_depth_,
7745 mj_lno_t num_local_coords_,
7746 mj_gno_t num_global_coords_,
7747 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
7749 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
7750 int num_weights_per_coord_,
7751 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights_,
7752 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
7753 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts_,
7754 Kokkos::View<mj_part_t *, device_t> & result_assigned_part_ids_,
7755 Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos_)
7760 this->mj_timer_base_string =
"MJ(" + std::to_string(execute_counter) +
") - ";
7763 this->mj_problemComm = problemComm;
7764 this->myActualRank = this->myRank = this->mj_problemComm->getRank();
7766 mj_timer_base_string +
"Total");
7767 this->mj_env->debug(3,
"In MultiJagged Jagged");
7768 this->imbalance_tolerance = imbalance_tolerance_;
7769 this->mj_num_teams = num_teams_;
7770 this->num_global_parts = num_global_parts_;
7771 this->part_no_array = part_no_array_;
7772 this->recursion_depth = recursion_depth_;
7773 this->coord_dim = coord_dim_;
7774 this->num_local_coords = num_local_coords_;
7775 this->num_global_coords = num_global_coords_;
7776 this->mj_coordinates = mj_coordinates_;
7777 this->initial_mj_gnos = initial_mj_gnos_;
7778 this->num_weights_per_coord = num_weights_per_coord_;
7779 this->mj_uniform_weights = mj_uniform_weights_;
7780 this->mj_weights = mj_weights_;
7781 this->mj_uniform_parts = mj_uniform_parts_;
7785 this->set_part_specifications();
7788 mj_timer_base_string +
"Allocate Views");
7789 this->allocate_set_work_memory();
7791 mj_timer_base_string +
"Allocate Views");
7795 this->comm = this->mj_problemComm->duplicate();
7798 if(comm->getRank() == 0) {
7799 std::cout <<
"size of gno:" <<
sizeof(mj_gno_t) << std::endl;
7800 std::cout <<
"size of lno:" <<
sizeof(mj_lno_t) << std::endl;
7801 std::cout <<
"size of mj_scalar_t:" <<
sizeof(mj_scalar_t) << std::endl;
7806 mj_part_t current_num_parts = 1;
7807 Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
7808 this->all_cut_coordinates;
7810 mj_timer_base_string +
"Problem_Partitioning");
7811 mj_part_t output_part_begin_index = 0;
7812 mj_part_t future_num_parts = this->total_num_part;
7813 bool is_data_ever_migrated =
false;
7815 std::vector<mj_part_t> *future_num_part_in_parts =
7816 new std::vector<mj_part_t> ();
7817 std::vector<mj_part_t> *next_future_num_parts_in_parts =
7818 new std::vector<mj_part_t> ();
7820 next_future_num_parts_in_parts->push_back(this->num_global_parts);
7822 RCP<mj_partBoxVector_t> input_part_boxes;
7823 RCP<mj_partBoxVector_t> output_part_boxes;
7825 if(this->mj_keep_part_boxes) {
7826 input_part_boxes = RCP<mj_partBoxVector_t>(
new mj_partBoxVector_t(),
true);
7827 output_part_boxes = RCP<mj_partBoxVector_t>(
new mj_partBoxVector_t(),
true);
7828 compute_global_box();
7829 this->init_part_boxes(output_part_boxes);
7836 Kokkos::View<mj_part_t*, device_t>
7837 view_rectilinear_cut_count(
"view_rectilinear_cut_count", 1);
7838 Kokkos::View<size_t*, device_t>
7839 view_total_reduction_size(
"view_total_reduction_size", 1);
7841 for(
int i = 0; i < this->recursion_depth; ++i) {
7844 std::string istring = std::to_string(i);
7850 std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
7851 future_num_part_in_parts = next_future_num_parts_in_parts;
7852 next_future_num_parts_in_parts = tmpPartVect;
7856 next_future_num_parts_in_parts->clear();
7857 if(this->mj_keep_part_boxes) {
7858 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7859 input_part_boxes = output_part_boxes;
7860 output_part_boxes = tmpPartBoxes;
7861 output_part_boxes->clear();
7865 mj_part_t output_part_count_in_dimension =
7866 this->update_part_num_arrays(
7867 future_num_part_in_parts,
7868 next_future_num_parts_in_parts,
7873 output_part_boxes, 1);
7878 if(output_part_count_in_dimension == current_num_parts) {
7880 tmpPartVect= future_num_part_in_parts;
7881 future_num_part_in_parts = next_future_num_parts_in_parts;
7882 next_future_num_parts_in_parts = tmpPartVect;
7884 if(this->mj_keep_part_boxes) {
7885 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7886 input_part_boxes = output_part_boxes;
7887 output_part_boxes = tmpPartBoxes;
7893 int coordInd = i % this->coord_dim;
7895 Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
7896 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
7899 mj_timer_base_string +
"Problem_Partitioning_" + istring);
7903 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
7904 "new part xadj", output_part_count_in_dimension);
7907 mj_part_t output_part_index = 0;
7911 mj_part_t output_coordinate_end_index = 0;
7913 mj_part_t current_work_part = 0;
7914 mj_part_t current_concurrent_num_parts =
7915 std::min(current_num_parts - current_work_part,
7916 this->max_concurrent_part_calculation);
7918 mj_part_t obtained_part_index = 0;
7920 auto host_process_local_min_max_coord_total_weight =
7921 Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
7922 auto host_global_min_max_coord_total_weight =
7923 Kokkos::create_mirror_view(global_min_max_coord_total_weight);
7926 for(; current_work_part < current_num_parts;
7929 current_concurrent_num_parts =
7930 std::min(current_num_parts - current_work_part,
7931 this->max_concurrent_part_calculation);
7934 auto local_device_num_partitioning_in_current_dim =
7935 device_num_partitioning_in_current_dim;
7936 Kokkos::parallel_reduce(
"Read bDoingWork",
7937 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
7938 KOKKOS_LAMBDA(
int dummy,
int & set_single) {
7941 if(local_device_num_partitioning_in_current_dim(
7942 current_work_part + kk) != 1) {
7948 bool bDoingWork = (bDoingWork_int != 0) ?
true :
false;
7950 this->mj_get_local_min_max_coord_totW(
7952 current_concurrent_num_parts,
7953 mj_current_dim_coords);
7958 this->mj_get_global_min_max_coord_totW(
7959 current_concurrent_num_parts,
7960 this->process_local_min_max_coord_total_weight,
7961 this->global_min_max_coord_total_weight);
7965 mj_part_t total_incomplete_cut_count = 0;
7970 mj_part_t concurrent_part_cut_shift = 0;
7971 mj_part_t concurrent_part_part_shift = 0;
7975 Kokkos::deep_copy(host_global_min_max_coord_total_weight,
7976 global_min_max_coord_total_weight);
7978 mj_scalar_t min_coordinate =
7979 host_global_min_max_coord_total_weight(kk);
7980 mj_scalar_t max_coordinate =
7981 host_global_min_max_coord_total_weight(
7982 kk + current_concurrent_num_parts);
7984 mj_scalar_t global_total_weight =
7985 host_global_min_max_coord_total_weight(
7986 kk + 2 * current_concurrent_num_parts);
7988 mj_part_t concurrent_current_part_index = current_work_part + kk;
7990 mj_part_t partition_count = host_num_partitioning_in_current_dim(
7991 concurrent_current_part_index);
7993 Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
7994 Kokkos::subview(current_cut_coordinates,
7995 std::pair<mj_lno_t, mj_lno_t>(
7996 concurrent_part_cut_shift, current_cut_coordinates.size()));
7997 Kokkos::View<mj_scalar_t *, device_t>
7998 current_target_part_weights =
7999 Kokkos::subview(target_part_weights,
8000 std::pair<mj_lno_t, mj_lno_t>(
8001 concurrent_part_part_shift, target_part_weights.size()));
8004 concurrent_part_cut_shift += partition_count - 1;
8006 concurrent_part_part_shift += partition_count;
8010 if(partition_count > 1 && min_coordinate <= max_coordinate) {
8014 total_incomplete_cut_count += partition_count - 1;
8016 this->incomplete_cut_count(kk) = partition_count - 1;
8019 this->mj_get_initial_cut_coords_target_weights(
8022 partition_count - 1,
8023 global_total_weight,
8025 current_target_part_weights,
8026 future_num_part_in_parts,
8027 next_future_num_parts_in_parts,
8028 concurrent_current_part_index,
8029 obtained_part_index);
8031 mj_lno_t coordinate_end_index =
8032 host_part_xadj(concurrent_current_part_index);
8033 mj_lno_t coordinate_begin_index =
8034 concurrent_current_part_index==0 ? 0 :
8035 host_part_xadj(concurrent_current_part_index - 1);
8037 this->set_initial_coordinate_parts(
8040 coordinate_begin_index, coordinate_end_index,
8041 this->coordinate_permutations,
8042 mj_current_dim_coords,
8043 this->assigned_part_ids,
8049 this->incomplete_cut_count(kk) = 0;
8052 obtained_part_index += partition_count;
8057 double used_imbalance = 0;
8060 mj_timer_base_string +
"Problem_Partitioning Get Part Weights");
8063 mj_current_dim_coords,
8066 current_concurrent_num_parts,
8067 current_cut_coordinates,
8068 total_incomplete_cut_count,
8069 view_rectilinear_cut_count,
8070 view_total_reduction_size);
8073 mj_timer_base_string +
"Problem_Partitioning Get Part Weights");
8078 mj_part_t output_array_shift = 0;
8079 mj_part_t cut_shift = 0;
8080 size_t tlr_shift = 0;
8081 size_t partweight_array_shift = 0;
8084 mj_part_t current_concurrent_work_part = current_work_part + kk;
8086 mj_part_t num_parts = host_num_partitioning_in_current_dim(
8087 current_concurrent_work_part);
8090 int coordinateA_bigger_than_coordinateB =
8091 host_global_min_max_coord_total_weight(kk) >
8092 host_global_min_max_coord_total_weight(
8093 kk + current_concurrent_num_parts);
8095 if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
8098 auto local_new_part_xadj = this->new_part_xadj;
8099 Kokkos::parallel_for(
8100 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8101 (0, num_parts), KOKKOS_LAMBDA (mj_part_t jj) {
8102 local_new_part_xadj(
8103 output_part_index + output_array_shift + jj) = 0;
8106 cut_shift += num_parts - 1;
8107 tlr_shift += (4 *(num_parts - 1) + 1);
8108 output_array_shift += num_parts;
8109 partweight_array_shift += (2 * (num_parts - 1) + 1);
8113 Kokkos::View<mj_scalar_t *, device_t>
8114 current_concurrent_cut_coordinate =
8115 Kokkos::subview(current_cut_coordinates,
8116 std::pair<mj_lno_t, mj_lno_t>(
8118 current_cut_coordinates.size()));
8119 Kokkos::View<mj_scalar_t *, device_t>
8120 used_local_cut_line_weight_to_left =
8121 Kokkos::subview(process_cut_line_weight_to_put_left,
8122 std::pair<mj_lno_t, mj_lno_t>(
8124 process_cut_line_weight_to_put_left.size()));
8126 this->thread_part_weight_work =
8128 this->thread_part_weights,
8129 std::pair<mj_lno_t, mj_lno_t>(
8130 partweight_array_shift,
8131 this->thread_part_weights.extent(0)));
8134 if(this->mj_keep_part_boxes) {
8136 for(mj_part_t j = 0; j < num_parts - 1; ++j) {
8137 mj_scalar_t temp_get_val;
8138 Kokkos::parallel_reduce(
"Read single",
8139 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8140 KOKKOS_LAMBDA(
int dummy, mj_scalar_t & set_single) {
8141 set_single = current_concurrent_cut_coordinate(j);
8143 (*output_part_boxes)
8144 [output_array_shift + output_part_index + j].
8145 updateMinMax(temp_get_val, 1 , coordInd);
8146 (*output_part_boxes)
8147 [output_array_shift + output_part_index + j + 1].
8148 updateMinMax(temp_get_val, 0 , coordInd);
8153 Kokkos::View<mj_lno_t*, device_t> sub_new_part_xadj =
8154 Kokkos::subview(this->new_part_xadj,
8155 std::pair<mj_lno_t, mj_lno_t>(
8156 output_part_index + output_array_shift,
8157 this->new_part_xadj.size()));
8159 this->mj_create_new_partitions(
8161 current_concurrent_work_part,
8162 mj_current_dim_coords,
8163 current_concurrent_cut_coordinate,
8164 used_local_cut_line_weight_to_left,
8169 mj_lno_t coordinate_end = host_part_xadj(
8170 current_concurrent_work_part);
8171 mj_lno_t coordinate_begin =
8172 current_concurrent_work_part==0 ? 0 : host_part_xadj(
8173 current_concurrent_work_part - 1);
8177 mj_lno_t part_size = coordinate_end - coordinate_begin;
8181 auto local_new_part_xadj = this->new_part_xadj;
8182 Kokkos::parallel_for(
8183 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
8184 (0, 1), KOKKOS_LAMBDA (
int dummy) {
8185 local_new_part_xadj(
8186 output_part_index + output_array_shift) = part_size;
8189 auto subview_new_coordinate_permutations =
8190 Kokkos::subview(this->new_coordinate_permutations,
8191 std::pair<mj_lno_t, mj_lno_t>(
8193 coordinate_begin + part_size));
8194 auto subview_coordinate_permutations =
8195 Kokkos::subview(this->coordinate_permutations,
8196 std::pair<mj_lno_t, mj_lno_t>(
8198 coordinate_begin + part_size));
8199 Kokkos::deep_copy(subview_new_coordinate_permutations,
8200 subview_coordinate_permutations);
8202 cut_shift += num_parts - 1;
8203 output_array_shift += num_parts;
8204 partweight_array_shift += (2 * (num_parts - 1) + 1);
8214 mj_part_t num_parts =
8215 host_num_partitioning_in_current_dim(current_work_part + kk);
8219 auto local_new_part_xadj = this->new_part_xadj;
8220 Kokkos::parallel_for(
8221 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8222 (0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
8223 local_new_part_xadj(output_part_index+ii) +=
8224 output_coordinate_end_index;
8229 Kokkos::parallel_reduce(
"Read single",
8230 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8231 KOKKOS_LAMBDA(
int dummy, mj_part_t & set_single) {
8233 local_new_part_xadj(output_part_index + num_parts - 1);
8235 output_coordinate_end_index = temp_get;
8237 output_part_index += num_parts;
8243 int current_world_size = this->comm->getSize();
8244 long migration_reduce_all_population =
8245 this->total_dim_num_reduce_all * current_world_size;
8246 bool is_migrated_in_current_dimension =
false;
8251 if(future_num_parts > 1 &&
8252 this->check_migrate_avoid_migration_option >= 0 &&
8253 current_world_size > 1) {
8255 mj_timer_base_string +
"Problem_Migration-" + istring);
8256 mj_part_t num_parts = output_part_count_in_dimension;
8258 if(this->mj_perform_migration(
8261 next_future_num_parts_in_parts,
8262 output_part_begin_index,
8263 migration_reduce_all_population,
8264 this->num_global_coords / (future_num_parts * current_num_parts),
8266 input_part_boxes, output_part_boxes) )
8268 is_migrated_in_current_dimension =
true;
8269 is_data_ever_migrated =
true;
8271 mj_timer_base_string +
"Problem_Migration-" + istring);
8274 this->total_dim_num_reduce_all /= num_parts;
8277 is_migrated_in_current_dimension =
false;
8279 mj_timer_base_string +
"Problem_Migration-" + istring);
8284 Kokkos::View<mj_lno_t*, device_t> tmp =
8285 this->coordinate_permutations;
8286 this->coordinate_permutations =
8287 this->new_coordinate_permutations;
8289 this->new_coordinate_permutations = tmp;
8290 if(!is_migrated_in_current_dimension) {
8291 this->total_dim_num_reduce_all -= current_num_parts;
8292 current_num_parts = output_part_count_in_dimension;
8297 local_part_xadj = this->new_part_xadj;
8298 this->host_part_xadj = Kokkos::create_mirror_view(
part_xadj);
8299 Kokkos::deep_copy(host_part_xadj,
part_xadj);
8301 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
"empty", 0);
8303 mj_timer_base_string +
"Problem_Partitioning_" + istring);
8308 delete future_num_part_in_parts;
8309 delete next_future_num_parts_in_parts;
8311 mj_timer_base_string +
"Problem_Partitioning");
8317 this->set_final_parts(
8319 output_part_begin_index,
8321 is_data_ever_migrated);
8323 result_assigned_part_ids_ = this->assigned_part_ids;
8324 result_mj_gnos_ = this->current_mj_gnos;
8326 mj_timer_base_string +
"Total");
8327 this->mj_env->debug(3,
"Out of MultiJagged");
8330 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
8331 typename mj_part_t,
typename mj_node_t>
8332 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8337 if(this->mj_keep_part_boxes) {
8338 return this->kept_boxes;
8341 throw std::logic_error(
"Error: part boxes are not stored.");
8345 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
8346 typename mj_part_t,
typename mj_node_t>
8347 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8353 mj_part_t ntasks = this->num_global_parts;
8354 int dim = (*localPartBoxes)[0].getDim();
8355 coord_t *localPartBoundaries =
new coord_t[ntasks * 2 *dim];
8357 memset(localPartBoundaries, 0,
sizeof(coord_t) * ntasks * 2 *dim);
8359 coord_t *globalPartBoundaries =
new coord_t[ntasks * 2 *dim];
8360 memset(globalPartBoundaries, 0,
sizeof(coord_t) * ntasks * 2 *dim);
8362 coord_t *localPartMins = localPartBoundaries;
8363 coord_t *localPartMaxs = localPartBoundaries + ntasks * dim;
8365 coord_t *globalPartMins = globalPartBoundaries;
8366 coord_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
8368 mj_part_t boxCount = localPartBoxes->size();
8369 for(mj_part_t i = 0; i < boxCount; ++i) {
8370 mj_part_t pId = (*localPartBoxes)[i].getpId();
8374 coord_t *lmins = (*localPartBoxes)[i].getlmins();
8375 coord_t *lmaxs = (*localPartBoxes)[i].getlmaxs();
8377 for(
int j = 0; j < dim; ++j) {
8378 localPartMins[dim * pId + j] = lmins[j];
8379 localPartMaxs[dim * pId + j] = lmaxs[j];
8392 reduceAll<int, coord_t>(*mj_problemComm, reductionOp,
8393 ntasks * 2 *dim, localPartBoundaries, globalPartBoundaries);
8395 RCP<mj_partBoxVector_t> pB(
new mj_partBoxVector_t(),
true);
8396 for(mj_part_t i = 0; i < ntasks; ++i) {
8398 globalPartMins + dim * i,
8399 globalPartMaxs + dim * i);
8412 delete []localPartBoundaries;
8413 delete []globalPartBoundaries;
8420 template <
typename Adapter>
8426 #ifndef DOXYGEN_SHOULD_SKIP_THIS
8430 typedef typename Adapter::scalar_t adapter_scalar_t;
8433 typedef float default_mj_scalar_t;
8439 (std::is_same<adapter_scalar_t, float>::value ||
8440 std::is_same<adapter_scalar_t, double>::value),
8441 adapter_scalar_t, default_mj_scalar_t>::type mj_scalar_t;
8448 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
8449 typedef typename mj_node_t::device_type
device_t;
8454 RCP<const Environment> mj_env;
8455 RCP<const Comm<int> > mj_problemComm;
8456 RCP<const typename Adapter::base_adapter_t> mj_adapter;
8459 double imbalance_tolerance;
8463 size_t num_global_parts;
8466 Kokkos::View<mj_part_t*, Kokkos::HostSpace> part_no_array;
8469 int recursion_depth;
8472 mj_lno_t num_local_coords;
8473 mj_gno_t num_global_coords;
8476 Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
8480 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8483 int num_weights_per_coord;
8486 Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_weights;
8489 Kokkos::View<mj_scalar_t**, device_t> mj_weights;
8492 Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_parts;
8502 mj_part_t num_first_level_parts;
8506 Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;
8510 bool distribute_points_on_cut_lines;
8513 mj_part_t max_concurrent_part_calculation;
8516 int check_migrate_avoid_migration_option;
8524 double minimum_migration_imbalance;
8525 bool mj_keep_part_boxes;
8529 int mj_premigration_option;
8530 int min_coord_per_rank_for_premigration;
8533 ArrayRCP<mj_part_t> comXAdj_;
8536 ArrayRCP<mj_part_t> comAdj_;
8541 void set_input_parameters(
const Teuchos::ParameterList &p);
8543 RCP<mj_partBoxVector_t> getGlobalBoxBoundaries()
const;
8545 bool mj_premigrate_to_subset(
8547 int migration_selection_option,
8548 RCP<const Environment> mj_env_,
8549 RCP<
const Comm<int> > mj_problemComm_,
8551 mj_lno_t num_local_coords_,
8552 mj_gno_t num_global_coords_,
size_t num_global_parts_,
8553 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8555 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8557 int num_weights_per_coord_,
8558 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8560 RCP<
const Comm<int> > &result_problemComm_,
8561 mj_lno_t & result_num_local_coords_,
8562 Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8564 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8565 result_mj_coordinates_,
8566 Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8567 int * &result_actual_owner_rank_);
8572 RCP<
const Comm<int> > &problemComm,
8573 const RCP<const typename Adapter::base_adapter_t> &adapter) :
8576 mj_problemComm(problemComm),
8577 mj_adapter(adapter),
8578 imbalance_tolerance(0),
8580 num_global_parts(1),
8583 num_local_coords(0),
8584 num_global_coords(0),
8585 num_weights_per_coord(0),
8586 num_first_level_parts(1),
8587 distribute_points_on_cut_lines(true),
8588 max_concurrent_part_calculation(1),
8589 check_migrate_avoid_migration_option(0),
8591 minimum_migration_imbalance(0.30),
8592 mj_keep_part_boxes(false),
8593 mj_run_as_rcb(false),
8594 mj_premigration_option(0),
8595 min_coord_per_rank_for_premigration(32000),
8609 const bool bUnsorted =
true;
8610 RCP<Zoltan2::IntegerRangeListValidator<int>> mj_parts_Validator =
8612 pl.set(
"mj_parts",
"0",
"list of parts for multiJagged partitioning "
8613 "algorithm. As many as the dimension count.", mj_parts_Validator);
8615 pl.set(
"mj_concurrent_part_count", 1,
"The number of parts whose cut "
8616 "coordinates will be calculated concurently.",
8619 pl.set(
"mj_minimum_migration_imbalance", 1.1,
8620 "mj_minimum_migration_imbalance, the minimum imbalance of the "
8621 "processors to avoid migration",
8624 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_option_validator =
8625 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 2) );
8626 pl.set(
"mj_migration_option", 1,
"Migration option, 0 for decision "
8627 "depending on the imbalance, 1 for forcing migration, 2 for "
8628 "avoiding migration", mj_migration_option_validator);
8630 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_type_validator =
8631 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 1) );
8632 pl.set(
"mj_migration_type", 0,
8633 "Migration type, 0 for migration to minimize the imbalance "
8634 "1 for migration to minimize messages exchanged the migration.",
8635 mj_migration_option_validator);
8638 pl.set(
"mj_keep_part_boxes",
false,
"Keep the part boundaries of the "
8642 pl.set(
"mj_enable_rcb",
false,
"Use MJ as RCB.",
8645 pl.set(
"mj_recursion_depth", -1,
"Recursion depth for MJ: Must be "
8648 RCP<Teuchos::EnhancedNumberValidator<int>>
8649 mj_num_teams_validator =
8650 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(
8651 0, Teuchos::EnhancedNumberTraits<int>::max()) );
8652 pl.set(
"mj_num_teams", 0,
8653 "How many teams for the main kernel loop"
8654 , mj_num_teams_validator);
8656 RCP<Teuchos::EnhancedNumberValidator<int>>
8657 mj_premigration_option_validator =
8658 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 1024) );
8660 pl.set(
"mj_premigration_option", 0,
8661 "Whether to do premigration or not. 0 for no migration "
8662 "x > 0 for migration to consecutive processors, "
8663 "the subset will be 0,x,2x,3x,...subset ranks."
8664 , mj_premigration_option_validator);
8666 pl.set(
"mj_premigration_coordinate_count", 32000,
"How many coordinate to "
8667 "assign each rank in multijagged after premigration"
8680 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
8684 mj_part_t pointAssign(
int dim, adapter_scalar_t *point)
const;
8686 void boxAssign(
int dim, adapter_scalar_t *lower, adapter_scalar_t *upper,
8687 size_t &nPartsFound, mj_part_t **partsFound)
const;
8691 void getCommunicationGraph(
8693 ArrayRCP<mj_part_t> &comXAdj,
8694 ArrayRCP<mj_part_t> &comAdj);
8696 void set_up_partitioning_data(
8700 std::string timer_base_string;
8708 template<
class dst_t,
class src_t>
8709 typename std::enable_if<std::is_same<
typename dst_t::value_type,
8710 typename src_t::value_type>::value>::type
8711 assign_if_same(dst_t & dst,
const src_t & src) {
8714 template<
class dst_t,
class src_t>
8715 typename std::enable_if<!std::is_same<
typename dst_t::value_type,
8716 typename src_t::value_type>::value>::type
8717 assign_if_same(dst_t & dst,
const src_t & src) {
8722 template <
typename Adapter>
8723 bool Zoltan2_AlgMJ<Adapter>::mj_premigrate_to_subset(
8725 int migration_selection_option,
8726 RCP<const Environment> mj_env_,
8727 RCP<
const Comm<int> > mj_problemComm_,
8729 mj_lno_t num_local_coords_,
8730 mj_gno_t num_global_coords_,
size_t num_global_parts_,
8731 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8733 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
8734 int num_weights_per_coord_,
8735 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8737 RCP<
const Comm<int> > & result_problemComm_,
8738 mj_lno_t &result_num_local_coords_,
8739 Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8741 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8742 result_mj_coordinates_,
8743 Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8744 int * &result_actual_owner_rank_)
8747 timer_base_string +
"PreMigration DistributorPlanCreating");
8749 int myRank = mj_problemComm_->getRank();
8750 int worldSize = mj_problemComm_->getSize();
8752 mj_part_t groupsize = worldSize / used_num_ranks;
8754 std::vector<mj_part_t> group_begins(used_num_ranks + 1, 0);
8756 mj_part_t i_am_sending_to = 0;
8757 bool am_i_a_receiver =
false;
8759 for(
int i = 0; i < used_num_ranks; ++i) {
8760 group_begins[i+ 1] = group_begins[i] + groupsize;
8761 if(worldSize % used_num_ranks > i) group_begins[i+ 1] += 1;
8762 if(i == used_num_ranks) group_begins[i+ 1] = worldSize;
8763 if(myRank >= group_begins[i] && myRank < group_begins[i + 1]) {
8764 i_am_sending_to = group_begins[i];
8766 if(myRank == group_begins[i]) {
8767 am_i_a_receiver =
true;
8771 ArrayView<const mj_part_t> idView(&(group_begins[0]), used_num_ranks );
8772 result_problemComm_ = mj_problemComm_->createSubcommunicator(idView);
8774 Tpetra::Distributor distributor(mj_problemComm_);
8776 std::vector<mj_part_t>
8777 coordinate_destinations(num_local_coords_, i_am_sending_to);
8779 ArrayView<const mj_part_t>
8780 destinations(&(coordinate_destinations[0]), num_local_coords_);
8781 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
8782 result_num_local_coords_ = num_incoming_gnos;
8784 timer_base_string +
"PreMigration DistributorPlanCreating");
8787 timer_base_string +
"PreMigration DistributorMigration");
8795 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
8796 Kokkos::ViewAllocateWithoutInitializing(
"sent_gnos"),
8797 initial_mj_gnos_.size());
8798 Kokkos::deep_copy(sent_gnos, initial_mj_gnos_);
8800 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos (
8801 Kokkos::ViewAllocateWithoutInitializing(
"received_gnos"),
8804 distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
8806 result_initial_mj_gnos_ = Kokkos::View<mj_gno_t*, device_t>(
8807 Kokkos::ViewAllocateWithoutInitializing(
"result_initial_mj_gnos_"),
8809 Kokkos::deep_copy(result_initial_mj_gnos_, received_gnos);
8815 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>
8816 host_src_coordinates(
8817 Kokkos::ViewAllocateWithoutInitializing(
"mj_coordinates"),
8818 this->mj_coordinates.extent(0), this->mj_coordinates.extent(1));
8820 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
8822 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> dst_coordinates(
8823 Kokkos::ViewAllocateWithoutInitializing(
"mj_coordinates"),
8824 num_incoming_gnos, this->coord_dim);
8826 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_coord(
8827 Kokkos::ViewAllocateWithoutInitializing(
"received_coord"),
8830 for(
int i = 0; i < this->coord_dim; ++i) {
8832 auto sent_coord = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
8834 distributor.doPostsAndWaits(sent_coord, 1, received_coord);
8836 Kokkos::deep_copy(Kokkos::subview(dst_coordinates, Kokkos::ALL, i),
8840 result_mj_coordinates_ = dst_coordinates;
8844 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
8845 Kokkos::ViewAllocateWithoutInitializing(
"mj_weights"),
8846 num_incoming_gnos, this->num_weights_per_coord);
8847 auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
8849 auto host_src_weights = Kokkos::create_mirror_view_and_copy(
8850 Kokkos::HostSpace(), this->mj_weights);
8853 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_weight(
8854 Kokkos::ViewAllocateWithoutInitializing(
"send_weight_buffer"),
8855 this->num_local_coords);
8857 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_weight(
8858 Kokkos::ViewAllocateWithoutInitializing(
"received_weight_buffer"),
8861 for(
int i = 0; i < this->num_weights_per_coord; ++i) {
8863 auto sub_host_src_weights
8864 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
8865 auto sub_host_dst_weights
8866 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
8874 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
8875 sent_weight[n] = sub_host_src_weights(n);
8878 distributor.doPostsAndWaits(sent_weight, 1, received_weight);
8881 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
8882 sub_host_dst_weights(n) = received_weight[n];
8885 Kokkos::deep_copy(dst_weights, host_dst_weights);
8886 result_mj_weights_ = dst_weights;
8890 Kokkos::View<int*, Kokkos::HostSpace> sent_owners(
8891 Kokkos::ViewAllocateWithoutInitializing(
"sent_owners"),
8893 Kokkos::deep_copy(sent_owners, myRank);
8895 Kokkos::View<int*, Kokkos::HostSpace> received_owners(
8896 Kokkos::ViewAllocateWithoutInitializing(
"received_owners"),
8899 distributor.doPostsAndWaits(sent_owners, 1, received_owners);
8901 result_actual_owner_rank_ =
new int[num_incoming_gnos];
8903 result_actual_owner_rank_,
8904 received_owners.data(),
8905 num_incoming_gnos *
sizeof(int));
8909 timer_base_string +
"PreMigration DistributorMigration");
8910 return am_i_a_receiver;
8920 template <
typename Adapter>
8929 int execute_counter =
8931 timer_base_string =
"partition(" + std::to_string(execute_counter) +
") - ";
8933 this->mj_env->timerStart(
MACRO_TIMERS, timer_base_string +
"all");
8935 this->mj_env->timerStart(
MACRO_TIMERS, timer_base_string +
"setup");
8937 this->set_up_partitioning_data(solution);
8939 this->set_input_parameters(this->mj_env->getParameters());
8940 if(this->mj_keep_part_boxes) {
8941 this->mj_partitioner.set_to_keep_part_boxes();
8944 this->mj_partitioner.set_partitioning_parameters(
8945 this->distribute_points_on_cut_lines,
8946 this->max_concurrent_part_calculation,
8947 this->check_migrate_avoid_migration_option,
8948 this->minimum_migration_imbalance, this->migration_type);
8950 RCP<const Comm<int> > result_problemComm = this->mj_problemComm;
8951 mj_lno_t result_num_local_coords = this->num_local_coords;
8952 Kokkos::View<mj_gno_t*, device_t> result_initial_mj_gnos;
8954 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8955 result_mj_coordinates = this->mj_coordinates;
8956 Kokkos::View<mj_scalar_t**, device_t> result_mj_weights =
8958 int *result_actual_owner_rank = NULL;
8960 Kokkos::View<const mj_gno_t*, device_t> result_initial_mj_gnos_ =
8961 this->initial_mj_gnos;
8979 int current_world_size = this->mj_problemComm->getSize();
8980 mj_lno_t threshold_num_local_coords =
8981 this->min_coord_per_rank_for_premigration;
8982 bool is_pre_migrated =
false;
8983 bool am_i_in_subset =
true;
8989 if(mj_premigration_option > 0 &&
8990 size_t (current_world_size) > this->num_global_parts &&
8991 this->num_global_coords < mj_gno_t (
8992 current_world_size * threshold_num_local_coords))
8994 if(this->mj_keep_part_boxes) {
8995 throw std::logic_error(
"Multijagged: mj_keep_part_boxes and "
8996 "mj_premigration_option are not supported together yet.");
8999 is_pre_migrated =
true;
9000 int migration_selection_option = mj_premigration_option;
9001 if(migration_selection_option * this->num_global_parts >
9002 (
size_t) (current_world_size)) {
9003 migration_selection_option =
9004 current_world_size / this->num_global_parts;
9007 int used_num_ranks = int (this->num_global_coords /
9008 float (threshold_num_local_coords) + 0.5);
9010 if(used_num_ranks == 0) {
9014 am_i_in_subset = this->mj_premigrate_to_subset(
9016 migration_selection_option,
9018 this->mj_problemComm,
9020 this->num_local_coords,
9021 this->num_global_coords,
9022 this->num_global_parts,
9023 this->initial_mj_gnos,
9024 this->mj_coordinates,
9025 this->num_weights_per_coord,
9029 result_num_local_coords,
9030 result_initial_mj_gnos,
9031 result_mj_coordinates,
9033 result_actual_owner_rank);
9035 result_initial_mj_gnos_ = result_initial_mj_gnos;
9038 Kokkos::View<mj_part_t *, device_t> result_assigned_part_ids;
9039 Kokkos::View<mj_gno_t*, device_t> result_mj_gnos;
9041 this->mj_env->timerStop(
MACRO_TIMERS, timer_base_string +
"setup");
9043 if(am_i_in_subset) {
9044 this->mj_partitioner.multi_jagged_part(
9047 this->imbalance_tolerance,
9049 this->num_global_parts,
9050 this->part_no_array,
9051 this->recursion_depth,
9053 result_num_local_coords,
9054 this->num_global_coords,
9055 result_initial_mj_gnos_,
9056 result_mj_coordinates,
9057 this->num_weights_per_coord,
9058 this->mj_uniform_weights,
9060 this->mj_uniform_parts,
9061 result_assigned_part_ids,
9066 this->mj_env->timerStart(
MACRO_TIMERS, timer_base_string +
"cleanup");
9069 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid;
9070 localGidToLid.reserve(result_num_local_coords);
9071 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> host_result_initial_mj_gnos(
9072 Kokkos::ViewAllocateWithoutInitializing(
"host_result_initial_mj_gnos"),
9073 result_initial_mj_gnos_.size());
9074 Kokkos::deep_copy(host_result_initial_mj_gnos, result_initial_mj_gnos_);
9075 for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9076 localGidToLid[host_result_initial_mj_gnos(i)] = i;
9079 ArrayRCP<mj_part_t> partId = arcp(
new mj_part_t[result_num_local_coords],
9080 0, result_num_local_coords,
true);
9081 auto host_result_assigned_part_ids =
9082 Kokkos::create_mirror_view(result_assigned_part_ids);
9083 Kokkos::deep_copy(host_result_assigned_part_ids, result_assigned_part_ids);
9084 auto host_result_mj_gnos = Kokkos::create_mirror_view(result_mj_gnos);
9085 Kokkos::deep_copy(host_result_mj_gnos, result_mj_gnos);
9086 for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9087 mj_lno_t origLID = localGidToLid[host_result_mj_gnos(i)];
9088 partId[origLID] = host_result_assigned_part_ids(i);
9093 if(is_pre_migrated) {
9094 this->mj_env->timerStart(
MACRO_TIMERS, timer_base_string +
9095 "PostMigration DistributorPlanCreating");
9096 Tpetra::Distributor distributor(this->mj_problemComm);
9098 ArrayView<const mj_part_t> actual_owner_destinations(
9099 result_actual_owner_rank , result_num_local_coords);
9101 mj_lno_t num_incoming_gnos = distributor.createFromSends(
9102 actual_owner_destinations);
9104 if(num_incoming_gnos != this->num_local_coords) {
9105 throw std::logic_error(
"Zoltan2 - Multijagged Post Migration - "
9106 "num incoming is not equal to num local coords");
9110 "PostMigration DistributorPlanCreating");
9112 "PostMigration DistributorMigration");
9114 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
9115 Kokkos::ViewAllocateWithoutInitializing(
"received_gnos"),
9117 Kokkos::View<mj_part_t*, Kokkos::HostSpace> received_partids(
9118 Kokkos::ViewAllocateWithoutInitializing(
"received_partids"),
9121 distributor.doPostsAndWaits(host_result_initial_mj_gnos, 1,
9124 Kokkos::View<mj_part_t*, Kokkos::HostSpace> sent_partnos;
9125 if (partId.size() > 0) {
9126 sent_partnos = Kokkos::View<mj_part_t*, Kokkos::HostSpace>(
9127 partId.getRawPtr(), partId.size());
9129 distributor.doPostsAndWaits(sent_partnos, 1, received_partids);
9132 partId = arcp(
new mj_part_t[this->num_local_coords],
9133 0, this->num_local_coords,
true);
9136 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid2;
9137 localGidToLid2.reserve(this->num_local_coords);
9138 auto host_initial_mj_gnos =
9139 Kokkos::create_mirror_view(this->initial_mj_gnos);
9140 Kokkos::deep_copy(host_initial_mj_gnos,
9141 this->initial_mj_gnos);
9142 for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9143 localGidToLid2[host_initial_mj_gnos(i)] = i;
9146 for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9147 mj_lno_t origLID = localGidToLid2[received_gnos[i]];
9148 partId[origLID] = received_partids[i];
9153 delete [] result_actual_owner_rank;
9156 timer_base_string +
"PostMigration DistributorMigration");
9158 solution->setParts(partId);
9159 this->mj_env->timerStop(
MACRO_TIMERS, timer_base_string +
"cleanup");
9162 this->mj_env->timerStop(
MACRO_TIMERS, timer_base_string +
"all");
9165 this->mj_coordinates = Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>();
9170 template <
typename Adapter>
9183 int criteria_dim = (this->num_weights_per_coord ?
9184 this->num_weights_per_coord : 1);
9188 this->num_global_parts = solution->getTargetGlobalNumberOfParts();
9191 this->mj_uniform_parts = Kokkos::View<bool *, Kokkos::HostSpace>(
9192 "uniform parts", criteria_dim);
9193 this->mj_uniform_weights = Kokkos::View<bool *, Kokkos::HostSpace>(
9194 "uniform weights", criteria_dim);
9196 Kokkos::View<const mj_gno_t *, device_t> gnos;
9197 Kokkos::View<adapter_scalar_t **, Kokkos::LayoutLeft, device_t> xyz_adapter;
9199 Kokkos::View<adapter_scalar_t **, device_t> wgts_adapter;
9202 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> xyz;
9203 Kokkos::View<mj_scalar_t **, device_t> wgts;
9207 if(std::is_same<mj_scalar_t, adapter_scalar_t>()) {
9210 assign_if_same(xyz, xyz_adapter);
9211 assign_if_same(wgts, wgts_adapter);
9216 xyz = Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
9217 (Kokkos::ViewAllocateWithoutInitializing(
9218 "xyz"), xyz_adapter.extent(0), xyz_adapter.extent(1));
9219 wgts = Kokkos::View<mj_scalar_t **, device_t>(
9220 Kokkos::ViewAllocateWithoutInitializing(
"wgts"),
9221 wgts_adapter.extent(0), wgts_adapter.extent(1));
9223 typedef typename Kokkos::View<mj_scalar_t **, device_t>::size_type view_size_t;
9224 Kokkos::parallel_for(
9225 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9226 (0, xyz_adapter.extent(0)), KOKKOS_LAMBDA (
int i) {
9227 for(view_size_t n = 0; n < xyz_adapter.extent(1); ++n) {
9228 xyz(i, n) =
static_cast<mj_scalar_t
>(xyz_adapter(i, n));
9231 Kokkos::parallel_for(
9232 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9233 (0, wgts.extent(0)), KOKKOS_LAMBDA (
int i) {
9234 for(view_size_t n = 0; n < wgts.extent(1); ++n) {
9235 wgts(i, n) =
static_cast<mj_scalar_t
>(wgts_adapter(i, n));
9241 this->initial_mj_gnos = gnos;
9243 this->mj_coordinates = xyz;
9246 if(this->num_weights_per_coord == 0) {
9247 this->mj_uniform_weights(0) =
true;
9248 Kokkos::resize(this->mj_weights, 0, 0);
9251 this->mj_weights = wgts;
9252 for(
int wdim = 0; wdim < this->num_weights_per_coord; ++wdim) {
9253 this->mj_uniform_weights(wdim) =
false;
9257 for(
int wdim = 0; wdim < criteria_dim; ++wdim) {
9258 if(solution->criteriaHasUniformPartSizes(wdim)) {
9259 this->mj_uniform_parts(wdim) =
true;
9262 printf(
"Error: MJ does not support non uniform target part weights\n");
9271 template <
typename Adapter>
9273 const Teuchos::ParameterList &pl)
9275 const Teuchos::ParameterEntry *pe = pl.getEntryPtr(
"imbalance_tolerance");
9278 tol = pe->getValue(&tol);
9279 this->imbalance_tolerance = tol - 1.0;
9283 if(this->imbalance_tolerance <= 0) {
9284 this->imbalance_tolerance= 10e-4;
9288 Kokkos::resize(this->part_no_array, 0);
9291 this->recursion_depth = 0;
9293 if(pl.getPtr<
int>(
"mj_num_teams")) {
9294 this->num_teams = pl.get<
int>(
"mj_num_teams");
9297 if(pl.getPtr<Array <mj_part_t> >(
"mj_parts")) {
9298 auto mj_parts = pl.get<Array <mj_part_t> >(
"mj_parts");
9299 int mj_parts_size =
static_cast<int>(mj_parts.size());
9302 this->part_no_array = Kokkos::View<mj_part_t*, Kokkos::HostSpace>(
9303 "part_no_array", mj_parts_size);
9304 for(
int i = 0; i < mj_parts_size; ++i) {
9305 this->part_no_array(i) = mj_parts.getRawPtr()[i];
9308 this->recursion_depth = mj_parts_size - 1;
9309 this->mj_env->debug(2,
"mj_parts provided by user");
9313 this->distribute_points_on_cut_lines =
true;
9314 this->max_concurrent_part_calculation = 1;
9316 this->mj_run_as_rcb =
false;
9317 this->mj_premigration_option = 0;
9318 this->min_coord_per_rank_for_premigration = 32000;
9320 int mj_user_recursion_depth = -1;
9321 this->mj_keep_part_boxes =
false;
9322 this->check_migrate_avoid_migration_option = 0;
9323 this->migration_type = 0;
9324 this->minimum_migration_imbalance = 0.35;
9326 pe = pl.getEntryPtr(
"mj_minimum_migration_imbalance");
9329 imb = pe->getValue(&imb);
9330 this->minimum_migration_imbalance = imb - 1.0;
9333 pe = pl.getEntryPtr(
"mj_migration_option");
9335 this->check_migrate_avoid_migration_option =
9336 pe->getValue(&this->check_migrate_avoid_migration_option);
9338 this->check_migrate_avoid_migration_option = 0;
9340 if(this->check_migrate_avoid_migration_option > 1) {
9341 this->check_migrate_avoid_migration_option = -1;
9345 pe = pl.getEntryPtr(
"mj_migration_type");
9347 this->migration_type = pe->getValue(&this->migration_type);
9349 this->migration_type = 0;
9355 pe = pl.getEntryPtr(
"mj_concurrent_part_count");
9357 this->max_concurrent_part_calculation =
9358 pe->getValue(&this->max_concurrent_part_calculation);
9360 this->max_concurrent_part_calculation = 1;
9363 pe = pl.getEntryPtr(
"mj_keep_part_boxes");
9365 this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
9367 this->mj_keep_part_boxes =
false;
9378 if(this->mj_keep_part_boxes ==
false) {
9379 pe = pl.getEntryPtr(
"mapping_type");
9381 int mapping_type = -1;
9382 mapping_type = pe->getValue(&mapping_type);
9383 if(mapping_type == 0) {
9384 mj_keep_part_boxes =
true;
9390 pe = pl.getEntryPtr(
"mj_enable_rcb");
9392 this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
9394 this->mj_run_as_rcb =
false;
9397 pe = pl.getEntryPtr(
"mj_premigration_option");
9399 mj_premigration_option = pe->getValue(&mj_premigration_option);
9401 mj_premigration_option = 0;
9404 pe = pl.getEntryPtr(
"mj_premigration_coordinate_count");
9406 min_coord_per_rank_for_premigration = pe->getValue(&mj_premigration_option);
9408 min_coord_per_rank_for_premigration = 32000;
9411 pe = pl.getEntryPtr(
"mj_recursion_depth");
9413 mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
9415 mj_user_recursion_depth = -1;
9419 pe = pl.getEntryPtr(
"rectilinear");
9421 val = pe->getValue(&val);
9424 this->distribute_points_on_cut_lines =
false;
9426 this->distribute_points_on_cut_lines =
true;
9429 if(this->mj_run_as_rcb) {
9430 mj_user_recursion_depth =
9431 (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
9433 if(this->recursion_depth < 1) {
9434 if(mj_user_recursion_depth > 0) {
9435 this->recursion_depth = mj_user_recursion_depth;
9438 this->recursion_depth = this->coord_dim;
9444 template <
typename Adapter>
9447 adapter_scalar_t *lower,
9448 adapter_scalar_t *upper,
9449 size_t &nPartsFound,
9459 if(this->mj_keep_part_boxes) {
9462 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9464 size_t nBoxes = (*partBoxes).size();
9466 throw std::logic_error(
"no part boxes exist");
9470 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9472 if(globalBox->boxesOverlap(dim, lower, upper)) {
9474 std::vector<typename Adapter::part_t> partlist;
9477 for(
size_t i = 0; i < nBoxes; i++) {
9479 if((*partBoxes)[i].boxesOverlap(dim, lower, upper)) {
9481 partlist.push_back((*partBoxes)[i].getpId());
9503 *partsFound =
new mj_part_t[nPartsFound];
9504 for(
size_t i = 0; i < nPartsFound; i++)
9505 (*partsFound)[i] = partlist[i];
9517 throw std::logic_error(
"need to use keep_cuts parameter for boxAssign");
9522 template <
typename Adapter>
9525 adapter_scalar_t *point)
const
9531 if(this->mj_keep_part_boxes) {
9535 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9537 size_t nBoxes = (*partBoxes).size();
9539 throw std::logic_error(
"no part boxes exist");
9543 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9545 if(globalBox->pointInBox(dim, point)) {
9549 for(i = 0; i < nBoxes; i++) {
9551 if((*partBoxes)[i].pointInBox(dim, point)) {
9552 foundPart = (*partBoxes)[i].getpId();
9566 std::ostringstream oss;
9568 for(
int j = 0; j < dim; j++) oss << point[j] <<
" ";
9569 oss <<
") not found in domain";
9570 throw std::logic_error(oss.str());
9580 size_t closestBox = 0;
9581 coord_t minDistance = std::numeric_limits<coord_t>::max();
9582 coord_t *centroid =
new coord_t[dim];
9583 for(
size_t i = 0; i < nBoxes; i++) {
9584 (*partBoxes)[i].computeCentroid(centroid);
9587 for(
int j = 0; j < dim; j++) {
9588 diff = centroid[j] - point[j];
9591 if(sum < minDistance) {
9596 foundPart = (*partBoxes)[closestBox].getpId();
9603 throw std::logic_error(
"need to use keep_cuts parameter for pointAssign");
9607 template <
typename Adapter>
9613 if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL) {
9614 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
9615 mj_part_t ntasks = (*pBoxes).size();
9616 int dim = (*pBoxes)[0].getDim();
9617 GridHash grid(pBoxes, ntasks, dim);
9624 template <
typename Adapter>
9625 RCP<typename Zoltan2_AlgMJ<Adapter>::mj_partBoxVector_t>
9628 return this->mj_partitioner.get_kept_boxes();
Zoltan2_MJArrayType< scalar_t > value_type
KOKKOS_INLINE_FUNCTION Zoltan2_MJArrayType()
KOKKOS_INLINE_FUNCTION void init(value_type dst) const
Kokkos::View< index_t *, device_t > part_xadj
GridHash Class, Hashing Class for part boxes.
void set_up_partitioning_data(const RCP< PartitioningSolution< Adapter > > &solution)
Time an algorithm (or other entity) as a whole.
global_size_t getGlobalNumCoordinates() const
Returns the global number coordinates.
void set(IT index_, CT count_, WT *vals_)
KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const
int value_count_rightleft
#define Z2_FORWARD_EXCEPTIONS
Forward an exception back through call stack.
KOKKOS_INLINE_FUNCTION void operator()(const member_type &teamMember, value_type teamSum) const
Kokkos::View< index_t *, device_t > track_on_cuts
static int get_counter_AlgMJ()
Defines Parameter related enumerators, declares functions.
KOKKOS_INLINE_FUNCTION ArrayReducer(value_type &val, int mj_value_count)
static RCP< Teuchos::BoolParameterEntryValidator > getBoolValidator()
Exists to make setting up validators less cluttered.
KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
void getAdjArrays(ArrayRCP< part_t > &comXAdj_, ArrayRCP< part_t > &comAdj_)
GridHash Class, returns the adj arrays.
Zoltan2_AlgMJ(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, const RCP< const typename Adapter::base_adapter_t > &adapter)
Zoltan2_BoxBoundaries(Ordinal s_)
Constructor.
Kokkos::View< scalar_t *, device_t > coordinates
Sort items for quick sort function.
std::bitset< NUM_MODEL_FLAGS > modelFlag_t
void uqSignsort(IT n, uSignedSortItem< IT, WT, SIGN > *arr)
Quick sort function. Sorts the arr of uSignedSortItems, with respect to increasing vals...
Kokkos::View< index_t *, device_t > permutations
map_t::global_ordinal_type gno_t
Class for sorting items with multiple values. First sorting with respect to val[0], then val[1] then ... val[count-1]. The last tie breaking is done with index values. Used for task mapping partitioning where the points on a cut line needs to be distributed consistently.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyDoubleValidator()
Exists to make setting up validators less cluttered.
Kokkos::View< scalar_t **, device_t > weights
KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
ArrayCombinationReducer reducer
void partition(const RCP< PartitioningSolution< Adapter > > &solution)
Multi Jagged coordinate partitioning algorithm.
void reduce(const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
Implement Teuchos::ValueTypeReductionOp interface.
KOKKOS_INLINE_FUNCTION value_type & reference() const
Kokkos::View< scalar_t * > scalar_view_t
coordinateModelPartBox Class, represents the boundaries of the box which is a result of a geometric p...
KOKKOS_INLINE_FUNCTION void init(value_type &dst) const
void set_to_keep_part_boxes()
Function call, if the part boxes are intended to be kept.
A ParameterList validator for integer range lists.
static int get_counter_Zoltan2_AlgMJ()
void getCommunicationGraph(const PartitioningSolution< Adapter > *solution, ArrayRCP< mj_part_t > &comXAdj, ArrayRCP< mj_part_t > &comAdj)
returns communication graph resulting from MJ partitioning.
SparseMatrixAdapter_t::part_t part_t
Multi Jagged coordinate partitioning algorithm.
This class provides geometric coordinates with optional weights to the Zoltan2 algorithm.
policy_t::member_type member_type
void boxAssign(int dim, adapter_scalar_t *lower, adapter_scalar_t *upper, size_t &nPartsFound, mj_part_t **partsFound) const
Kokkos::View< scalar_t *, device_t > cut_coordinates
A PartitioningSolution is a solution to a partitioning problem.
Zoltan2_BoxBoundaries()
Default Constructor.
Kokkos::View< index_t *, device_t > permutations
void set_partitioning_parameters(bool distribute_points_on_cut_lines_, int max_concurrent_part_calculation_, int check_migrate_avoid_migration_option_, double minimum_migration_imbalance_, int migration_type_=0)
Multi Jagged coordinate partitioning algorithm.
KOKKOS_INLINE_FUNCTION value_type & reference() const
size_t getCoordinatesKokkos(Kokkos::View< const gno_t *, typename node_t::device_type > &Ids, Kokkos::View< scalar_t **, Kokkos::LayoutLeft, typename node_t::device_type > &xyz, Kokkos::View< scalar_t **, typename node_t::device_type > &wgts) const
Returns the coordinate ids, values and optional weights.
AlgMJ()
Multi Jagged coordinate partitioning algorithm default constructor.
Kokkos::View< part_t *, device_t > parts
Zoltan2_BoxBoundaries is a reduction operation to all reduce the all box boundaries.
ReduceWeightsFunctor(int mj_loop_count, array_t mj_max_scalar, part_t mj_concurrent_current_part, part_t mj_num_cuts, part_t mj_current_work_part, part_t mj_current_concurrent_num_parts, part_t mj_left_right_array_size, part_t mj_weight_array_size, Kokkos::View< index_t *, device_t > &mj_permutations, Kokkos::View< scalar_t *, device_t > &mj_coordinates, Kokkos::View< scalar_t **, device_t > &mj_weights, Kokkos::View< part_t *, device_t > &mj_parts, Kokkos::View< scalar_t *, device_t > &mj_cut_coordinates, Kokkos::View< index_t *, device_t > &mj_part_xadj, bool mj_uniform_weights0, scalar_t mj_sEpsilon)
Algorithm defines the base class for all algorithms.
map_t::local_ordinal_type lno_t
mj_partBoxVector_t & getPartBoxesView() const
for partitioning methods, return bounding boxes of the
#define Z2_ASSERT_VALUE(actual, expected)
Throw an error when actual value is not equal to expected value.
RCP< mj_partBoxVector_t > get_kept_boxes() const
DOCWORK: Documentation.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyIntValidator()
Exists to make setting up validators less cluttered.
part_t concurrent_current_part
Kokkos::View< part_t *, device_t > parts
RCP< mj_partBoxVector_t > compute_global_box_boundaries(RCP< mj_partBoxVector_t > &localPartBoxes) const
DOCWORK: Documentation.
uMultiSortItem(IT index_, CT count_, WT *vals_)
KOKKOS_INLINE_FUNCTION ArrayCombinationReducer(scalar_t mj_max_scalar, value_type &val, int mj_value_count_rightleft, int mj_value_count_weights)
KOKKOS_INLINE_FUNCTION void init(value_type dst) const
size_t getLocalNumCoordinates() const
Returns the number of coordinates on this process.
int getNumWeightsPerCoordinate() const
Returns the number (0 or greater) of weights per coordinate.
Define IntegerRangeList validator.
size_t team_shmem_size(int team_size) const
Defines the CoordinateModel classes.
void multi_jagged_part(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, double imbalance_tolerance, int num_teams, size_t num_global_parts, Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, int recursion_depth, int coord_dim, mj_lno_t num_local_coords, mj_gno_t num_global_coords, Kokkos::View< const mj_gno_t *, device_t > &initial_mj_gnos, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates, int num_weights_per_coord, Kokkos::View< bool *, Kokkos::HostSpace > &mj_uniform_weights, Kokkos::View< mj_scalar_t **, device_t > &mj_weights, Kokkos::View< bool *, Kokkos::HostSpace > &mj_uniform_parts, Kokkos::View< mj_part_t *, device_t > &result_assigned_part_ids, Kokkos::View< mj_gno_t *, device_t > &result_mj_gnos)
Multi Jagged coordinate partitioning algorithm.
part_t concurrent_current_part
KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const
Kokkos::View< scalar_t * > scalar_view_t
policy_t::member_type member_type
Tpetra::global_size_t global_size_t
Zoltan2_MJArrayType< scalar_t > value_type
part_t current_concurrent_num_parts
mj_part_t pointAssign(int dim, adapter_scalar_t *point) const
KOKKOS_INLINE_FUNCTION void init(value_type &dst) const
#define Z2_THROW_OUTSIDE_ERROR(env)
Throw an error returned from outside the Zoltan2 library.
int value_count_rightleft
int getCoordinateDim() const
Returns the dimension of the coordinates.
Kokkos::View< scalar_t *, device_t > coordinates
ReduceArrayFunctor(part_t mj_concurrent_current_part, part_t mj_weight_array_size, Kokkos::View< index_t *, device_t > &mj_permutations, Kokkos::View< scalar_t *, device_t > &mj_coordinates, Kokkos::View< part_t *, device_t > &mj_parts, Kokkos::View< index_t *, device_t > &mj_part_xadj, Kokkos::View< index_t *, device_t > &mj_track_on_cuts)
Kokkos::View< index_t *, device_t > part_xadj
KOKKOS_INLINE_FUNCTION Zoltan2_MJArrayType(scalar_t *pSetPtr)
A gathering of useful namespace methods.
void uqsort(IT n, uSortItem< IT, WT > *arr)
Quick sort function. Sorts the arr of uSortItems, with respect to increasing vals. DOCWORK: Document input params.
Contains Teuchos redcution operators for the Multi-jagged algorthm.
RCP< mj_partBox_t > get_global_box() const
DOCWORK: Documentation.
KOKKOS_INLINE_FUNCTION void join(volatile value_type &dst, const volatile value_type &src) const
size_t team_shmem_size(int team_size) const
Multi Jagged coordinate partitioning algorithm.
static void getValidParameters(ParameterList &pl)
Set up validators specific to this algorithm.
void sequential_task_partitioning(const RCP< const Environment > &env, mj_lno_t num_total_coords, mj_lno_t num_selected_coords, size_t num_target_part, int coord_dim, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates_, Kokkos::View< mj_lno_t *, device_t > &initial_selected_coords_output_permutation, mj_lno_t *output_xadj, int recursion_depth_, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, bool partition_along_longest_dim, int num_ranks_per_node, bool divide_to_prime_first_, mj_part_t num_first_level_parts_=1, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &first_level_distribution_=Kokkos::View< mj_part_t *, Kokkos::HostSpace >())
Special function for partitioning for task mapping. Runs sequential, and performs deterministic parti...