14 #ifndef _ZOLTAN2_ALGMultiJagged_HPP_
15 #define _ZOLTAN2_ALGMultiJagged_HPP_
24 #include <Tpetra_Distributor.hpp>
25 #include <Teuchos_StandardParameterEntryValidators.hpp>
26 #include <Teuchos_ParameterList.hpp>
27 #include <Kokkos_Sort.hpp>
31 #include <unordered_map>
33 #ifdef ZOLTAN2_USEZOLTANCOMM
34 #ifdef HAVE_ZOLTAN2_MPI
35 #define ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
36 #include "zoltan_comm_cpp.h"
37 #include "zoltan_types.h"
46 template <
typename Ordinal,
typename T>
57 epsilon(std::numeric_limits<T>::epsilon()) {}
63 size(s_), epsilon(std::numeric_limits<T>::epsilon()) {}
70 void reduce(
const Ordinal count,
const T inBuffer[], T inoutBuffer[])
const {
71 for(Ordinal i = 0; i < count; i++) {
72 if(
Z2_ABS(inBuffer[i]) > epsilon) {
73 inoutBuffer[i] = inBuffer[i];
89 template <
typename IT,
typename CT,
typename WT>
109 this->
index = index_;
110 this->
count = count_;
118 void set(IT index_ ,CT count_, WT *vals_) {
119 this->
index = index_;
120 this->
count = count_;
124 bool operator<(const uMultiSortItem<IT,CT,WT>& other)
const {
125 assert(this->
count == other.count);
126 for(CT i = 0; i < this->
count; ++i) {
128 if(std::abs(this->
val[i] - other.val[i]) < this->epsilon) {
132 if(this->
val[i] < other.val[i]) {
141 return this->
index < other.index;
147 template <
class IT,
class WT>
158 template <
class IT,
class WT>
160 const int NSTACK = 50;
162 IT i, ir=n, j, k, l=1;
163 IT jstack=0, istack[NSTACK];
170 for(j=l+1;j<=ir;j++) {
173 for(i=j-1;i>=1;i--) {
174 if(arr[i].val <= aval)
187 std::swap(arr[k],arr[l+1]);
188 if(arr[l+1].val > arr[ir].val) {
189 std::swap(arr[l+1],arr[ir]);
191 if(arr[l].val > arr[ir].val) {
192 std::swap(arr[l],arr[ir]);
194 if(arr[l+1].val > arr[l].val) {
195 std::swap(arr[l+1],arr[l]);
202 do i++;
while (arr[i].val < aval);
203 do j--;
while (arr[j].val > aval);
205 std::swap(arr[i],arr[j]);
210 if(jstack > NSTACK) {
211 std::cout <<
"uqsort: NSTACK too small in sort." << std::endl;
228 template <
class IT,
class WT,
class SIGN>
234 bool operator<(const uSignedSortItem<IT, WT, SIGN>& rhs)
const {
236 if(this->
signbit < rhs.signbit) {
240 else if(this->
signbit == rhs.signbit) {
241 if(this->
val < rhs.val) {
245 else if(this->
val > rhs.val) {
259 bool operator<=(const uSignedSortItem<IT, WT, SIGN>& rhs) {
260 return (this->
val == rhs.val && this->signbit == rhs.signbit) || (*
this < rhs);
267 template <
class IT,
class WT,
class SIGN>
269 const IT NSTACK = 50;
271 IT i, ir=n, j, k, l=1;
272 IT jstack=0, istack[NSTACK];
278 for(j=l+1;j<=ir;j++) {
280 for(i=j-1;i>=1;i--) {
296 std::swap(arr[k],arr[l+1]);
297 if(arr[ir] < arr[l+1]) {
298 std::swap(arr[l+1],arr[ir]);
300 if(arr[ir] < arr[l] ) {
301 std::swap(arr[l],arr[ir]);
303 if(arr[l] < arr[l+1]) {
304 std::swap(arr[l+1],arr[l]);
310 do i++;
while (arr[i] < a);
311 do j--;
while (a < arr[j]);
313 std::swap(arr[i],arr[j]);
318 if(jstack > NSTACK) {
319 std::cout <<
"uqsort: NSTACK too small in sort." << std::endl;
355 static int counter = 0;
359 static int counter = 0;
366 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
367 typename mj_part_t,
typename mj_node_t>
371 typedef typename mj_node_t::device_type device_t;
373 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
378 static constexpr
size_t future_reduceall_cutoff = 1500000;
382 static constexpr mj_lno_t min_work_last_dim = 1000;
384 static constexpr mj_scalar_t least_signifiance = 0.0001;
385 static constexpr
int significance_mul = 1000;
387 std::string mj_timer_base_string;
389 RCP<const Environment> mj_env;
390 RCP<const Comm<int> > mj_problemComm;
391 RCP<Comm<int> > comm;
392 double imbalance_tolerance;
395 int num_weights_per_coord;
396 size_t initial_num_loc_coords;
398 mj_lno_t num_local_coords;
399 mj_gno_t num_global_coords;
400 mj_scalar_t sEpsilon;
403 bool distribute_points_on_cut_lines;
406 mj_part_t max_concurrent_part_calculation;
409 int mj_user_recursion_depth;
410 bool mj_keep_part_boxes;
413 int check_migrate_avoid_migration_option;
420 double minimum_migration_imbalance;
437 mj_part_t num_first_level_parts;
441 Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;
443 mj_part_t total_num_cut ;
444 mj_part_t total_num_part;
446 mj_part_t max_num_part_along_dim ;
447 mj_part_t max_num_cut_along_dim;
450 size_t max_num_total_part_along_dim;
452 mj_part_t total_dim_num_reduce_all;
456 mj_part_t last_dim_num_part;
459 Kokkos::View<mj_part_t *, Kokkos::HostSpace> part_no_array;
463 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
467 Kokkos::View<mj_scalar_t **, device_t> mj_weights;
470 Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_parts;
473 Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_weights;
477 size_t num_global_parts;
480 RCP<mj_partBoxVector_t> kept_boxes;
482 RCP<mj_partBox_t> global_box;
487 bool divide_to_prime_first;
490 Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
493 Kokkos::View<mj_gno_t*, device_t> current_mj_gnos;
496 Kokkos::View<int*, Kokkos::HostSpace> owner_of_coordinate;
499 Kokkos::View<mj_lno_t*, device_t> coordinate_permutations;
502 Kokkos::View<mj_lno_t*, device_t> new_coordinate_permutations;
505 Kokkos::View<mj_part_t*, device_t> assigned_part_ids;
508 Kokkos::View<mj_lno_t *, device_t> part_xadj;
511 Kokkos::View<mj_lno_t *, device_t> new_part_xadj;
513 Kokkos::View<mj_scalar_t *, device_t> all_cut_coordinates;
516 Kokkos::View<mj_scalar_t *, device_t>
517 process_cut_line_weight_to_put_left;
520 Kokkos::View<mj_scalar_t *, device_t>
521 thread_cut_line_weight_to_put_left;
527 Kokkos::View<mj_scalar_t *, device_t> cut_coordinates_work_array;
530 Kokkos::View<mj_scalar_t *, device_t> temp_cut_coords;
533 Kokkos::View<mj_scalar_t *, device_t> target_part_weights;
536 Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_coordinates;
539 Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_coordinates;
542 Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_weights;
545 Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_weights;
549 Kokkos::View<mj_scalar_t *, device_t>
550 process_local_min_max_coord_total_weight;
553 Kokkos::View<mj_scalar_t *, device_t>
554 global_min_max_coord_total_weight;
558 Kokkos::View<bool *, device_t> is_cut_line_determined;
564 Kokkos::View<mj_part_t *, device_t> device_incomplete_cut_count;
565 typename decltype(device_incomplete_cut_count)::HostMirror
566 incomplete_cut_count;
569 typename decltype (part_xadj)::HostMirror host_part_xadj;
572 Kokkos::View<double *, device_t>
576 Kokkos::View<double *, device_t>
577 thread_part_weight_work;
581 Kokkos::View<mj_scalar_t *, device_t>
582 thread_cut_left_closest_point;
586 Kokkos::View<mj_scalar_t *, device_t>
587 thread_cut_right_closest_point;
590 Kokkos::View<mj_lno_t *, device_t>
593 Kokkos::View<mj_scalar_t *, device_t> process_rectilinear_cut_weight;
594 Kokkos::View<mj_scalar_t *, device_t> global_rectilinear_cut_weight;
600 Kokkos::View<mj_scalar_t *, device_t>
601 total_part_weight_left_right_closests;
602 Kokkos::View<mj_scalar_t *, device_t>
603 global_total_part_weight_left_right_closests;
605 Kokkos::View<mj_part_t*, device_t> device_num_partitioning_in_current_dim;
606 typename decltype(device_num_partitioning_in_current_dim)::HostMirror
607 host_num_partitioning_in_current_dim;
614 KOKKOS_INLINE_FUNCTION
615 double calculate_imbalance(mj_scalar_t achieved, mj_scalar_t expected) {
616 return static_cast<double>(achieved) / static_cast<double>(expected) - 1.0;
625 void set_part_specifications();
632 inline mj_part_t get_part_count(
633 mj_part_t num_total_future,
642 void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);
663 mj_part_t update_part_num_arrays(
664 std::vector<mj_part_t> *future_num_part_in_parts,
665 std::vector<mj_part_t> *next_future_num_parts_in_parts,
666 mj_part_t &future_num_parts,
667 mj_part_t current_num_parts,
668 int current_iteration,
669 RCP<mj_partBoxVector_t> input_part_boxes,
670 RCP<mj_partBoxVector_t> output_part_boxes,
671 mj_part_t atomic_part_count);
685 KOKKOS_INLINE_FUNCTION
686 void mj_calculate_new_cut_position (
687 mj_scalar_t cut_upper_bound,
688 mj_scalar_t cut_lower_bound,
689 mj_scalar_t cut_upper_weight,
690 mj_scalar_t cut_lower_weight,
691 mj_scalar_t expected_weight,
692 mj_scalar_t &new_cut_position,
693 mj_scalar_t sEpsilon);
719 bool mj_perform_migration(
720 mj_part_t in_num_parts,
721 mj_part_t &out_num_parts,
722 std::vector<mj_part_t> *next_future_num_parts_in_parts,
723 mj_part_t &output_part_begin_index,
724 size_t migration_reduce_all_population,
725 mj_lno_t num_coords_for_last_dim_part,
726 std::string iteration,
727 RCP<mj_partBoxVector_t> &input_part_boxes,
728 RCP<mj_partBoxVector_t> &output_part_boxes);
747 bool mj_check_to_migrate(
748 size_t migration_reduce_all_population,
749 mj_lno_t num_coords_for_last_dim_part,
752 mj_gno_t *num_points_in_all_processor_parts);
778 void mj_migration_part_proc_assignment(
779 mj_gno_t * num_points_in_all_processor_parts,
782 mj_lno_t *send_count_to_each_proc,
783 std::vector<mj_part_t> &processor_ranks_for_subcomm,
784 std::vector<mj_part_t> *next_future_num_parts_in_parts,
785 mj_part_t &out_num_part,
786 std::vector<mj_part_t> &out_part_indices,
787 mj_part_t &output_part_numbering_begin_index,
788 int *coordinate_destinations);
815 void mj_assign_proc_to_parts(
816 mj_gno_t * num_points_in_all_processor_parts,
819 mj_lno_t *send_count_to_each_proc,
820 std::vector<mj_part_t> &processor_ranks_for_subcomm,
821 std::vector<mj_part_t> *next_future_num_parts_in_parts,
822 mj_part_t &out_part_index,
823 mj_part_t &output_part_numbering_begin_index,
824 int *coordinate_destinations);
841 void assign_send_destinations(
843 mj_part_t *part_assignment_proc_begin_indices,
844 mj_part_t *processor_chains_in_parts,
845 mj_lno_t *send_count_to_each_proc,
846 int *coordinate_destinations);
862 void assign_send_destinations2(
865 int *coordinate_destinations,
866 mj_part_t &output_part_numbering_begin_index,
867 std::vector<mj_part_t> *next_future_num_parts_in_parts);
891 void mj_assign_parts_to_procs(
892 mj_gno_t * num_points_in_all_processor_parts,
895 mj_lno_t *send_count_to_each_proc,
896 std::vector<mj_part_t> *next_future_num_parts_in_parts,
897 mj_part_t &out_num_part,
898 std::vector<mj_part_t> &out_part_indices,
899 mj_part_t &output_part_numbering_begin_index,
900 int *coordinate_destinations);
915 void mj_migrate_coords(
917 mj_lno_t &num_new_local_points,
918 std::string iteration,
919 int *coordinate_destinations,
920 mj_part_t num_parts);
927 void create_sub_communicator(
928 std::vector<mj_part_t> &processor_ranks_for_subcomm);
934 mj_part_t find_largest_prime_factor(mj_part_t num_parts) {
935 mj_part_t largest_factor = 1;
936 mj_part_t n = num_parts;
937 mj_part_t divisor = 2;
939 while (n % divisor == 0) {
941 largest_factor = divisor;
944 if(divisor * divisor > n) {
951 return largest_factor;
984 const RCP<const Environment> &env,
985 RCP<
const Comm<int> > &problemComm,
986 double imbalance_tolerance,
988 size_t num_global_parts,
989 Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array,
992 mj_lno_t num_local_coords,
993 mj_gno_t num_global_coords,
994 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos,
996 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates,
997 int num_weights_per_coord,
998 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights,
999 Kokkos::View<mj_scalar_t**, device_t> & mj_weights,
1000 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts,
1001 Kokkos::View<mj_part_t*, device_t> & result_assigned_part_ids,
1002 Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos);
1017 bool distribute_points_on_cut_lines_,
1018 int max_concurrent_part_calculation_,
1019 int check_migrate_avoid_migration_option_,
1020 double minimum_migration_imbalance_,
1021 int migration_type_ = 0);
1038 RCP<mj_partBoxVector_t> &localPartBoxes)
const;
1079 const RCP<const Environment> &env,
1080 mj_lno_t num_total_coords,
1081 mj_lno_t num_selected_coords,
1082 size_t num_target_part,
1085 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
1086 Kokkos::View<mj_lno_t *, device_t> &
1087 initial_selected_coords_output_permutation,
1088 mj_lno_t *output_xadj,
1089 int recursion_depth_,
1090 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array,
1091 bool partition_along_longest_dim,
1092 int num_ranks_per_node,
1093 bool divide_to_prime_first_,
1094 mj_part_t num_first_level_parts_ = 1,
1095 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_
1096 = Kokkos::View<mj_part_t *, Kokkos::HostSpace>());
1098 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
1106 void allocate_set_work_memory();
1109 void compute_global_box();
1118 void mj_get_local_min_max_coord_totW(
1119 mj_part_t current_work_part,
1120 mj_part_t current_concurrent_num_parts,
1121 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords);
1135 void mj_get_global_min_max_coord_totW(
1136 mj_part_t current_concurrent_num_parts,
1137 Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
1138 Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total);
1170 void mj_get_initial_cut_coords_target_weights(
1171 mj_scalar_t min_coord,
1172 mj_scalar_t max_coord,
1173 mj_part_t num_cuts ,
1174 mj_scalar_t global_weight,
1175 Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
1176 Kokkos::View<mj_scalar_t *, device_t> & target_part_weights,
1177 std::vector <mj_part_t> *future_num_part_in_parts,
1178 std::vector <mj_part_t> *next_future_num_parts_in_parts,
1179 mj_part_t concurrent_current_part,
1180 mj_part_t obtained_part_index,
1181 mj_part_t num_target_first_level_parts = 1,
1182 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist =
1183 Kokkos::View<mj_part_t *, Kokkos::HostSpace>());
1201 void set_initial_coordinate_parts(
1202 mj_scalar_t &max_coordinate,
1203 mj_scalar_t &min_coordinate,
1204 mj_lno_t coordinate_begin_index,
1205 mj_lno_t coordinate_end_index,
1206 Kokkos::View<mj_lno_t *, device_t> &
1207 mj_current_coordinate_permutations,
1208 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1209 Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
1210 mj_part_t &partition_count);
1229 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1230 double imbalanceTolerance,
1231 mj_part_t current_work_part,
1232 mj_part_t current_concurrent_num_parts,
1233 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
1234 mj_part_t total_incomplete_cut_count,
1235 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
1236 Kokkos::View<size_t*, device_t> & view_total_reduction_size);
1243 void mj_1D_part_get_part_weights(
1244 mj_part_t current_concurrent_num_parts,
1245 mj_part_t current_work_part,
1246 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1256 void mj_combine_rightleft_and_weights(
1257 mj_part_t current_work_part,
1258 mj_part_t current_concurrent_num_parts);
1272 void mj_create_new_partitions(
1273 mj_part_t num_parts,
1274 mj_part_t current_concurrent_work_part,
1275 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1276 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
1277 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
1278 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj);
1315 void mj_get_new_cut_coordinates(
1316 mj_part_t current_concurrent_num_parts,
1318 const mj_part_t &num_cuts,
1319 const double &used_imbalance_tolerance,
1320 Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
1321 Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
1322 Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
1323 Kokkos::View<bool *, device_t> & current_cut_line_determined,
1324 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
1325 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
1326 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
1327 Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
1328 Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
1329 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
1330 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
1331 Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
1332 Kokkos::View<mj_scalar_t *, device_t> &
1333 current_part_cut_line_weight_to_put_left,
1334 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count);
1345 void get_processor_num_points_in_parts(
1346 mj_part_t num_procs,
1347 mj_part_t num_parts,
1348 mj_gno_t *&num_points_in_all_processor_parts);
1354 void fill_permutation_array(
1355 mj_part_t output_num_parts,
1356 mj_part_t num_parts);
1379 void create_consistent_chunks(
1380 mj_part_t num_parts,
1381 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1382 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
1383 mj_lno_t coordinate_begin,
1384 mj_lno_t coordinate_end,
1385 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
1386 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
1388 bool longest_dim_part,
1399 void set_final_parts(
1400 mj_part_t current_num_parts,
1401 mj_part_t output_part_begin_index,
1402 RCP<mj_partBoxVector_t> &output_part_boxes,
1403 bool is_data_ever_migrated);
1408 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1409 typename mj_part_t,
typename mj_node_t>
1411 mj_env(), mj_problemComm(), comm(), imbalance_tolerance(0),
1412 recursion_depth(0), coord_dim(0),
1413 num_weights_per_coord(0), initial_num_loc_coords(0),
1414 initial_num_glob_coords(0),
1415 num_local_coords(0), num_global_coords(0),
1416 sEpsilon(std::numeric_limits<mj_scalar_t>::
epsilon() * 100),
1417 distribute_points_on_cut_lines(true),
1418 max_concurrent_part_calculation(1),
1419 mj_run_as_rcb(false), mj_user_recursion_depth(0),
1420 mj_keep_part_boxes(false),
1421 check_migrate_avoid_migration_option(0), migration_type(0),
1422 minimum_migration_imbalance(0.30),
1423 num_first_level_parts(1),
1424 total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
1425 max_num_cut_along_dim(0),
1426 max_num_total_part_along_dim(0),
1427 total_dim_num_reduce_all(0),
1428 last_dim_num_part(0),
1430 num_global_parts(1),
1431 kept_boxes(), global_box(),
1432 myRank(0), myActualRank(0),
1433 divide_to_prime_first(false)
1480 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1481 typename mj_part_t,
typename mj_node_t>
1484 const RCP<const Environment> &env,
1485 mj_lno_t num_total_coords,
1486 mj_lno_t num_selected_coords,
1487 size_t num_target_part,
1490 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> &
1492 Kokkos::View<mj_lno_t *, device_t> & initial_adjList_output_adjlist,
1493 mj_lno_t *output_xadj,
1494 int recursion_depth_,
1495 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array_,
1496 bool partition_along_longest_dim,
1497 int num_ranks_per_node,
1498 bool divide_to_prime_first_,
1499 mj_part_t num_first_level_parts_,
1500 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_)
1503 const RCP<Comm<int> > commN;
1504 this->mj_problemComm = Teuchos::DefaultComm<int>::getDefaultSerialComm(commN);
1505 this->comm = Teuchos::rcp_const_cast<Comm<int> >(this->mj_problemComm);
1506 this->myActualRank = this->myRank = 1;
1508 this->divide_to_prime_first = divide_to_prime_first_;
1513 this->imbalance_tolerance = 0;
1514 this->num_global_parts = num_target_part;
1515 this->part_no_array = part_no_array_;
1516 this->recursion_depth = recursion_depth_;
1520 this->num_first_level_parts = num_first_level_parts_;
1522 this->first_level_distribution = first_level_distribution_;
1524 this->coord_dim = coord_dim_;
1525 this->num_local_coords = num_total_coords;
1527 this->num_global_coords = num_total_coords;
1528 this->mj_coordinates = mj_coordinates_;
1531 this->initial_mj_gnos =
1532 Kokkos::View<mj_gno_t*, device_t>(
"gids", this->num_local_coords);
1534 this->num_weights_per_coord = 0;
1536 this->mj_uniform_weights = Kokkos::View<bool*, Kokkos::HostSpace>(
1537 "uniform weights", 1);
1538 this->mj_uniform_weights(0) =
true;
1540 this->mj_weights = Kokkos::View<mj_scalar_t**, device_t>
1543 this->mj_uniform_parts =
1544 Kokkos::View<bool*, Kokkos::HostSpace>(
"uniform parts", 1);
1545 this->mj_uniform_parts(0) =
true;
1547 this->set_part_specifications();
1549 this->allocate_set_work_memory();
1552 auto local_part_xadj = this->part_xadj;
1553 Kokkos::parallel_for(
1554 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
1555 KOKKOS_LAMBDA (
int dummy) {
1556 local_part_xadj(0) =
static_cast<mj_lno_t
>(num_selected_coords);
1559 Kokkos::deep_copy(coordinate_permutations, initial_adjList_output_adjlist);
1561 mj_part_t current_num_parts = 1;
1563 Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
1564 this->all_cut_coordinates;
1566 mj_part_t future_num_parts = this->total_num_part;
1568 std::vector<mj_part_t> *future_num_part_in_parts =
1569 new std::vector<mj_part_t>();
1570 std::vector<mj_part_t> *next_future_num_parts_in_parts =
1571 new std::vector<mj_part_t>();
1572 next_future_num_parts_in_parts->push_back(this->num_global_parts);
1573 RCP<mj_partBoxVector_t> t1;
1574 RCP<mj_partBoxVector_t> t2;
1576 std::vector <uSignedSortItem<int, mj_scalar_t, char>>
1577 coord_dimension_range_sorted(this->coord_dim);
1579 &(coord_dimension_range_sorted[0]);
1580 std::vector <mj_scalar_t> coord_dim_mins(this->coord_dim);
1581 std::vector <mj_scalar_t> coord_dim_maxs(this->coord_dim);
1585 Kokkos::View<mj_part_t*, device_t>
1586 view_rectilinear_cut_count(
"view_rectilinear_cut_count", 1);
1587 Kokkos::View<size_t*, device_t>
1588 view_total_reduction_size(
"view_total_reduction_size", 1);
1590 for(
int rd = 0; rd < this->recursion_depth; ++rd) {
1596 std::vector<mj_part_t> *tmpPartVect = future_num_part_in_parts;
1597 future_num_part_in_parts = next_future_num_parts_in_parts;
1598 next_future_num_parts_in_parts = tmpPartVect;
1602 next_future_num_parts_in_parts->clear();
1605 mj_part_t output_part_count_in_dimension =
1606 this->update_part_num_arrays(
1607 future_num_part_in_parts,
1608 next_future_num_parts_in_parts,
1613 t2, num_ranks_per_node);
1618 if(output_part_count_in_dimension == current_num_parts) {
1619 tmpPartVect = future_num_part_in_parts;
1620 future_num_part_in_parts = next_future_num_parts_in_parts;
1621 next_future_num_parts_in_parts = tmpPartVect;
1626 std::string istring = std::to_string(rd);
1630 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
1631 "new part xadj", output_part_count_in_dimension);
1635 mj_part_t output_part_index = 0;
1639 mj_part_t output_coordinate_end_index = 0;
1641 mj_part_t current_work_part = 0;
1642 mj_part_t current_concurrent_num_parts = 1;
1644 mj_part_t obtained_part_index = 0;
1647 int coordInd = rd % this->coord_dim;
1649 Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
1650 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1652 auto host_process_local_min_max_coord_total_weight =
1653 Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
1654 auto host_global_min_max_coord_total_weight =
1655 Kokkos::create_mirror_view(global_min_max_coord_total_weight);
1658 for(; current_work_part < current_num_parts;
1659 current_work_part += current_concurrent_num_parts) {
1661 mj_part_t actual_work_part_count = 0;
1666 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1667 mj_part_t current_work_part_in_concurrent_parts =
1668 current_work_part + kk;
1672 mj_part_t partition_count = host_num_partitioning_in_current_dim(
1673 current_work_part_in_concurrent_parts);
1674 if(partition_count == 1) {
1677 ++actual_work_part_count;
1678 if(partition_along_longest_dim) {
1679 auto local_process_local_min_max_coord_total_weight =
1680 this->process_local_min_max_coord_total_weight;
1681 for(
int coord_traverse_ind = 0;
1682 coord_traverse_ind < this->coord_dim; ++coord_traverse_ind) {
1684 Kokkos::View<mj_scalar_t *, device_t> coords =
1685 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coord_traverse_ind);
1687 this->mj_get_local_min_max_coord_totW(
1689 current_concurrent_num_parts,
1692 coord_dimension_range_sorted[coord_traverse_ind].id =
1694 coord_dimension_range_sorted[coord_traverse_ind].signbit = 1;
1696 Kokkos::deep_copy(host_process_local_min_max_coord_total_weight,
1697 process_local_min_max_coord_total_weight);
1699 coord_dim_mins[coord_traverse_ind] =
1700 host_process_local_min_max_coord_total_weight(kk);
1701 coord_dim_maxs[coord_traverse_ind] =
1702 host_process_local_min_max_coord_total_weight(
1703 kk + current_concurrent_num_parts);
1704 coord_dimension_range_sorted[coord_traverse_ind].val =
1705 host_process_local_min_max_coord_total_weight(
1706 kk + current_concurrent_num_parts) -
1707 host_process_local_min_max_coord_total_weight(kk);
1710 uqSignsort(this->coord_dim, p_coord_dimension_range_sorted);
1711 coordInd = p_coord_dimension_range_sorted[this->coord_dim - 1].
id;
1712 auto set_min = coord_dim_mins[coordInd];
1713 auto set_max = coord_dim_maxs[coordInd];
1714 Kokkos::parallel_for(
1715 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
1716 (0, 1), KOKKOS_LAMBDA (
int dummy) {
1717 local_process_local_min_max_coord_total_weight(kk) = set_min;
1718 local_process_local_min_max_coord_total_weight(
1719 kk + current_concurrent_num_parts) = set_max;
1722 mj_current_dim_coords =
1723 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1726 Kokkos::View<mj_scalar_t *, device_t> coords =
1727 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1728 this->mj_get_local_min_max_coord_totW(
1730 current_concurrent_num_parts,
1736 if(actual_work_part_count > 0) {
1738 this->mj_get_global_min_max_coord_totW(
1739 current_concurrent_num_parts,
1740 this->process_local_min_max_coord_total_weight,
1741 this->global_min_max_coord_total_weight);
1744 Kokkos::deep_copy(host_global_min_max_coord_total_weight,
1745 global_min_max_coord_total_weight);
1749 mj_part_t total_incomplete_cut_count = 0;
1754 mj_part_t concurrent_part_cut_shift = 0;
1755 mj_part_t concurrent_part_part_shift = 0;
1756 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1757 mj_scalar_t min_coordinate =
1758 host_global_min_max_coord_total_weight(kk);
1759 mj_scalar_t max_coordinate = host_global_min_max_coord_total_weight(
1760 kk + current_concurrent_num_parts);
1761 mj_scalar_t global_total_weight = host_global_min_max_coord_total_weight(
1762 kk + 2*current_concurrent_num_parts);
1764 mj_part_t concurrent_current_part_index = current_work_part + kk;
1766 mj_part_t partition_count = host_num_partitioning_in_current_dim(
1767 concurrent_current_part_index);
1769 Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
1770 Kokkos::subview(current_cut_coordinates,
1771 std::pair<mj_lno_t, mj_lno_t>(
1772 concurrent_part_cut_shift,
1773 current_cut_coordinates.size()));
1774 Kokkos::View<mj_scalar_t *, device_t>
1775 current_target_part_weights =
1776 Kokkos::subview(target_part_weights,
1777 std::pair<mj_lno_t, mj_lno_t>(
1778 concurrent_part_part_shift,
1779 target_part_weights.size()));
1782 concurrent_part_cut_shift += partition_count - 1;
1784 concurrent_part_part_shift += partition_count;
1787 if(partition_count > 1 && min_coordinate <= max_coordinate) {
1790 total_incomplete_cut_count += partition_count - 1;
1792 this->incomplete_cut_count(kk) = partition_count - 1;
1800 this->mj_get_initial_cut_coords_target_weights(
1803 partition_count - 1,
1804 global_total_weight,
1806 current_target_part_weights,
1807 future_num_part_in_parts,
1808 next_future_num_parts_in_parts,
1809 concurrent_current_part_index,
1810 obtained_part_index,
1811 rd == 0 ? this->num_first_level_parts : 1,
1812 this->first_level_distribution);
1814 mj_lno_t coordinate_end_index =
1815 host_part_xadj(concurrent_current_part_index);
1816 mj_lno_t coordinate_begin_index =
1817 (concurrent_current_part_index==0) ? 0 :
1818 host_part_xadj[concurrent_current_part_index - 1];
1821 this->set_initial_coordinate_parts(
1824 coordinate_begin_index, coordinate_end_index,
1825 this->coordinate_permutations,
1826 mj_current_dim_coords,
1827 this->assigned_part_ids,
1833 this->incomplete_cut_count(kk) = 0;
1835 obtained_part_index += partition_count;
1840 double used_imbalance = 0;
1844 mj_timer_base_string +
"mj_1D_part()");
1847 mj_current_dim_coords,
1850 current_concurrent_num_parts,
1851 current_cut_coordinates,
1852 total_incomplete_cut_count,
1853 view_rectilinear_cut_count,
1854 view_total_reduction_size);
1857 mj_timer_base_string +
"mj_1D_part()");
1860 obtained_part_index += current_concurrent_num_parts;
1864 mj_part_t output_array_shift = 0;
1865 mj_part_t cut_shift = 0;
1866 size_t tlr_shift = 0;
1867 size_t partweight_array_shift = 0;
1869 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1870 mj_part_t current_concurrent_work_part = current_work_part + kk;
1872 mj_part_t num_parts = host_num_partitioning_in_current_dim(
1873 current_concurrent_work_part);
1876 int coordinateA_bigger_than_coordinateB =
1877 host_global_min_max_coord_total_weight(kk) >
1878 host_global_min_max_coord_total_weight(
1879 kk + current_concurrent_num_parts);
1881 if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
1884 auto local_new_part_xadj = this->new_part_xadj;
1885 Kokkos::parallel_for(
1886 Kokkos::RangePolicy<
typename mj_node_t::execution_space,
1887 mj_part_t> (0, num_parts), KOKKOS_LAMBDA(mj_part_t jj) {
1888 local_new_part_xadj(
1889 output_part_index + output_array_shift + jj) = 0;
1892 cut_shift += num_parts - 1;
1893 tlr_shift += (4 *(num_parts - 1) + 1);
1894 output_array_shift += num_parts;
1895 partweight_array_shift += (2 * (num_parts - 1) + 1);
1898 mj_lno_t coordinate_end =
1899 host_part_xadj(current_concurrent_work_part);
1900 mj_lno_t coordinate_begin =
1901 current_concurrent_work_part==0 ? 0 :
1902 host_part_xadj(current_concurrent_work_part-1);
1904 Kokkos::View<mj_scalar_t *, device_t>
1905 current_concurrent_cut_coordinate =
1906 Kokkos::subview(current_cut_coordinates,
1907 std::pair<mj_lno_t, mj_lno_t>(
1909 current_cut_coordinates.size()));
1910 Kokkos::View<mj_scalar_t *, device_t>
1911 used_local_cut_line_weight_to_left =
1912 Kokkos::subview(process_cut_line_weight_to_put_left,
1913 std::pair<mj_lno_t, mj_lno_t>(
1915 process_cut_line_weight_to_put_left.size()));
1917 this->thread_part_weight_work =
1919 this->thread_part_weights,
1920 std::pair<mj_lno_t, mj_lno_t>(
1921 partweight_array_shift,
1922 this->thread_part_weights.size()));
1926 Kokkos::View<mj_lno_t *, device_t> subview_new_part_xadj =
1927 Kokkos::subview(this->new_part_xadj,
1928 std::pair<mj_lno_t, mj_lno_t>(
1929 output_part_index + output_array_shift,
1930 this->new_part_xadj.size()));
1932 this->create_consistent_chunks(
1934 mj_current_dim_coords,
1935 current_concurrent_cut_coordinate,
1938 used_local_cut_line_weight_to_left,
1939 subview_new_part_xadj,
1941 partition_along_longest_dim,
1942 p_coord_dimension_range_sorted);
1947 mj_lno_t part_size = coordinate_end - coordinate_begin;
1949 auto local_new_part_xadj = this->new_part_xadj;
1950 Kokkos::parallel_for(
1951 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
1952 (0, 1), KOKKOS_LAMBDA (
int dummy) {
1953 local_new_part_xadj(output_part_index + output_array_shift)
1957 auto subview_new_coordinate_permutations =
1958 Kokkos::subview(this->new_coordinate_permutations,
1959 std::pair<mj_lno_t, mj_lno_t>(
1961 coordinate_begin + part_size));
1962 auto subview_coordinate_permutations =
1963 Kokkos::subview(this->coordinate_permutations,
1964 std::pair<mj_lno_t, mj_lno_t>(
1966 coordinate_begin + part_size));
1967 Kokkos::deep_copy(subview_new_coordinate_permutations,
1968 subview_coordinate_permutations);
1971 cut_shift += num_parts - 1;
1972 tlr_shift += (4 *(num_parts - 1) + 1);
1973 output_array_shift += num_parts;
1974 partweight_array_shift += (2 * (num_parts - 1) + 1);
1983 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
1984 mj_part_t num_parts =
1985 host_num_partitioning_in_current_dim(current_work_part + kk);
1986 auto local_new_part_xadj = this->new_part_xadj;
1987 auto local_mj_current_dim_coords = mj_current_dim_coords;
1988 auto local_new_coordinate_permutations =
1989 new_coordinate_permutations;
1990 Kokkos::parallel_for(
1991 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t> (
1992 0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
1994 local_new_part_xadj(output_part_index+ii) +=
1995 output_coordinate_end_index;
1998 mj_lno_t coordinate_end =
1999 local_new_part_xadj(output_part_index+ii);
2000 mj_lno_t coordinate_begin =
2001 local_new_part_xadj(output_part_index);
2003 for(mj_lno_t task_traverse = coordinate_begin;
2004 task_traverse < coordinate_end; ++task_traverse) {
2005 mj_lno_t l = local_new_coordinate_permutations(task_traverse);
2007 local_mj_current_dim_coords(l) = -local_mj_current_dim_coords(l);
2013 mj_part_t get_single;
2014 Kokkos::parallel_reduce(
"Read new_part_xadj",
2015 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
2016 KOKKOS_LAMBDA(
int dummy, mj_part_t & set_single) {
2017 set_single = local_new_part_xadj(output_part_index + num_parts - 1);
2020 output_coordinate_end_index = get_single;
2022 output_part_index += num_parts;
2029 current_num_parts = output_part_count_in_dimension;
2032 Kokkos::View<mj_lno_t *, device_t> tmp = this->coordinate_permutations;
2033 this->coordinate_permutations = this->new_coordinate_permutations;
2034 this->new_coordinate_permutations = tmp;
2036 this->part_xadj = this->new_part_xadj;
2037 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
2038 Kokkos::deep_copy(host_part_xadj, part_xadj);
2039 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
"empty", 0);
2042 Kokkos::deep_copy(initial_adjList_output_adjlist, coordinate_permutations);
2046 for(
size_t i = 0; i < this->num_global_parts ; ++i) {
2047 output_xadj[i+1] = host_part_xadj(i);
2050 delete future_num_part_in_parts;
2051 delete next_future_num_parts_in_parts;
2057 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2058 typename mj_part_t,
typename mj_node_t>
2060 <mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,mj_node_t>::mj_partBox_t>
2064 return this->global_box;
2069 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2070 typename mj_part_t,
typename mj_node_t>
2071 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2072 mj_node_t>::set_to_keep_part_boxes()
2074 this->mj_keep_part_boxes =
true;
2084 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2085 typename mj_part_t,
typename mj_node_t>
2089 this->total_num_cut = 0;
2090 this->total_num_part = 1;
2091 this->max_num_part_along_dim = 0;
2092 this->total_dim_num_reduce_all = 0;
2093 this->last_dim_num_part = 1;
2096 this->max_num_cut_along_dim = 0;
2097 this->max_num_total_part_along_dim = 0;
2099 if(this->part_no_array.size()) {
2100 auto local_recursion_depth = this->recursion_depth;
2102 this->total_dim_num_reduce_all =
2103 this->total_num_part * this->recursion_depth;
2105 this->total_num_part = 1;
2106 for(
int i = 0; i < local_recursion_depth; ++i) {
2107 this->total_num_part *= this->part_no_array(i);
2110 mj_part_t track_max = 0;
2111 for(
int i = 0; i < local_recursion_depth; ++i) {
2112 if(part_no_array(i) > track_max) {
2113 track_max = this->part_no_array(i);
2117 this->last_dim_num_part = this->total_num_part /
2118 this->part_no_array(local_recursion_depth-1);
2120 this->max_num_part_along_dim = track_max;
2121 this->num_global_parts = this->total_num_part;
2123 mj_part_t future_num_parts = this->num_global_parts;
2127 if (this->first_level_distribution.size() != 0 &&
2128 this->num_first_level_parts > 1) {
2129 this->max_num_part_along_dim = this->num_first_level_parts;
2134 for(
int rd = 0; rd < this->recursion_depth; ++rd) {
2135 mj_part_t maxNoPartAlongI = 0;
2136 mj_part_t nfutureNumParts = 0;
2142 this->first_level_distribution.size() != 0 &&
2143 this->num_first_level_parts > 1) {
2145 maxNoPartAlongI = this->num_first_level_parts;
2146 this->max_num_part_along_dim = this->num_first_level_parts;
2148 mj_part_t sum_first_level_dist = 0;
2149 mj_part_t max_part = 0;
2152 for (
int i = 0; i < this->num_first_level_parts; ++i) {
2153 sum_first_level_dist += this->first_level_distribution(i);
2154 if (this->first_level_distribution(i) > max_part)
2155 max_part = this->first_level_distribution(i);
2161 this->num_global_parts * max_part / sum_first_level_dist;
2165 maxNoPartAlongI = this->get_part_count(future_num_parts,
2166 1.0f / (this->recursion_depth - rd));
2167 if (maxNoPartAlongI > this->max_num_part_along_dim)
2168 this->max_num_part_along_dim = maxNoPartAlongI;
2169 nfutureNumParts = future_num_parts / maxNoPartAlongI;
2170 if (future_num_parts % maxNoPartAlongI) {
2174 future_num_parts = nfutureNumParts;
2176 this->total_num_part = this->num_global_parts;
2178 if(this->divide_to_prime_first) {
2179 this->total_dim_num_reduce_all = this->num_global_parts * 2;
2180 this->last_dim_num_part = this->num_global_parts;
2187 for(
int i = 0; i < this->recursion_depth; ++i) {
2188 this->total_dim_num_reduce_all += p;
2189 p *= this->max_num_part_along_dim;
2192 if(p / this->max_num_part_along_dim > this->num_global_parts) {
2193 this->last_dim_num_part = this->num_global_parts;
2196 this->last_dim_num_part = p / this->max_num_part_along_dim;
2201 this->total_num_cut = this->total_num_part - 1;
2202 this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
2203 this->max_num_total_part_along_dim = this->max_num_part_along_dim +
2204 size_t(this->max_num_cut_along_dim);
2209 if(this->max_concurrent_part_calculation > this->last_dim_num_part) {
2210 if(this->mj_problemComm->getRank() == 0) {
2211 std::cerr <<
"Warning: Concurrent part count (" <<
2212 this->max_concurrent_part_calculation <<
2213 ") has been set bigger than maximum amount that can be used." <<
2214 " Setting to:" << this->last_dim_num_part <<
"." << std::endl;
2216 this->max_concurrent_part_calculation = this->last_dim_num_part;
2225 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2226 typename mj_part_t,
typename mj_node_t>
2227 inline mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2228 get_part_count(mj_part_t num_total_future,
double root)
2230 double fp = pow(num_total_future, root);
2231 mj_part_t ip = mj_part_t(fp);
2259 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2260 typename mj_part_t,
typename mj_node_t>
2261 mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2262 update_part_num_arrays(
2263 std::vector<mj_part_t> *future_num_part_in_parts,
2264 std::vector<mj_part_t> *next_future_num_parts_in_parts,
2265 mj_part_t &future_num_parts,
2266 mj_part_t current_num_parts,
2267 int current_iteration,
2268 RCP<mj_partBoxVector_t> input_part_boxes,
2269 RCP<mj_partBoxVector_t> output_part_boxes,
2270 mj_part_t atomic_part_count)
2272 std::vector<mj_part_t> num_partitioning_in_current_dim;
2275 mj_part_t output_num_parts = 0;
2276 if(this->part_no_array.size()) {
2280 mj_part_t current_part_no_array =
2281 this->part_no_array(current_iteration);
2283 if(current_part_no_array < 1) {
2284 std::cout <<
"Current recursive iteration: " << current_iteration <<
2285 " part_no_array[" << current_iteration <<
"] is given as:" <<
2286 current_part_no_array << std::endl;
2289 if(current_part_no_array == 1) {
2290 return current_num_parts;
2294 if (this->first_level_distribution.size() != 0 &&
2295 current_iteration == 0 &&
2296 current_part_no_array != this->num_first_level_parts) {
2297 std::cout <<
"Current recursive iteration: " << current_iteration
2298 <<
" part_no_array[" << current_iteration <<
"] is given as: " <<
2299 current_part_no_array <<
" and contradicts num_first_level_parts: " <<
2300 this->num_first_level_parts << std::endl;
2304 for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
2305 num_partitioning_in_current_dim.push_back(current_part_no_array);
2322 future_num_parts /= num_partitioning_in_current_dim[0];
2323 output_num_parts = current_num_parts *
2324 num_partitioning_in_current_dim[0];
2325 if(this->mj_keep_part_boxes) {
2326 for(mj_part_t k = 0; k < current_num_parts; ++k) {
2328 for(mj_part_t j = 0; j <
2329 num_partitioning_in_current_dim[0]; ++j) {
2330 output_part_boxes->push_back((*input_part_boxes)[k]);
2338 for(mj_part_t ii = 0; ii < output_num_parts; ++ii) {
2339 next_future_num_parts_in_parts->push_back(future_num_parts);
2349 future_num_parts = 1;
2352 for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
2354 mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];
2358 mj_part_t num_partitions_in_current_dim =
2359 this->get_part_count(future_num_parts_of_part_ii,
2360 1.0 / (this->recursion_depth - current_iteration)
2362 if(num_partitions_in_current_dim > this->max_num_part_along_dim) {
2363 std::cerr <<
"ERROR: maxPartNo calculation is wrong."
2364 " num_partitions_in_current_dim: "
2365 << num_partitions_in_current_dim <<
" this->max_num_part_along_dim: "
2366 << this->max_num_part_along_dim <<
2367 " this->recursion_depth: " << this->recursion_depth <<
2368 " current_iteration:" << current_iteration <<
2369 " future_num_parts_of_part_ii: " << future_num_parts_of_part_ii <<
2370 " might need to fix max part no calculation for "
2371 "largest_prime_first partitioning." <<
2383 if (current_iteration == 0 &&
2384 this->first_level_distribution.size() != 0 &&
2385 this->num_first_level_parts > 1) {
2388 num_partitioning_in_current_dim.push_back(this->num_first_level_parts);
2391 output_num_parts = this->num_first_level_parts;
2394 future_num_parts /= this->num_first_level_parts;
2396 mj_part_t max_part = 0;
2397 mj_part_t sum_first_level_dist = 0;
2401 for (
int i = 0; i < this->num_first_level_parts; ++i) {
2402 sum_first_level_dist += this->first_level_distribution(i);
2404 if (this->first_level_distribution(i) > max_part)
2405 max_part = this->first_level_distribution(i);
2409 future_num_parts = this->num_global_parts * max_part / sum_first_level_dist;
2413 for (
int i = 0; i < this->num_first_level_parts; ++i) {
2414 next_future_num_parts_in_parts->push_back(this->first_level_distribution(i) *
2415 this->num_global_parts / sum_first_level_dist);
2418 else if (this->divide_to_prime_first) {
2420 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2422 mj_part_t largest_prime_factor = num_partitions_in_current_dim;
2425 output_num_parts += num_partitions_in_current_dim;
2427 if (future_num_parts_of_part_ii == atomic_part_count ||
2428 future_num_parts_of_part_ii % atomic_part_count != 0) {
2429 atomic_part_count = 1;
2432 largest_prime_factor =
2433 this->find_largest_prime_factor(future_num_parts_of_part_ii / atomic_part_count);
2440 if (largest_prime_factor < num_partitions_in_current_dim) {
2441 largest_prime_factor = num_partitions_in_current_dim;
2444 mj_part_t ideal_num_future_parts_in_part =
2445 (future_num_parts_of_part_ii / atomic_part_count) / largest_prime_factor;
2447 mj_part_t ideal_prime_scale = largest_prime_factor / num_partitions_in_current_dim;
2455 for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2457 mj_part_t my_ideal_primescale = ideal_prime_scale;
2459 if (iii < (largest_prime_factor) % num_partitions_in_current_dim) {
2460 ++my_ideal_primescale;
2463 mj_part_t num_future_parts_for_part_iii =
2464 ideal_num_future_parts_in_part * my_ideal_primescale;
2467 if (iii < (future_num_parts_of_part_ii / atomic_part_count) % largest_prime_factor) {
2469 ++num_future_parts_for_part_iii;
2472 next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);
2475 if (this->mj_keep_part_boxes) {
2476 output_part_boxes->push_back((*input_part_boxes)[ii]);
2480 if (num_future_parts_for_part_iii > future_num_parts)
2481 future_num_parts = num_future_parts_for_part_iii;
2487 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2490 output_num_parts += num_partitions_in_current_dim;
2492 if((future_num_parts_of_part_ii == atomic_part_count) ||
2493 (future_num_parts_of_part_ii % atomic_part_count != 0)) {
2494 atomic_part_count = 1;
2497 mj_part_t ideal_num_future_parts_in_part =
2498 (future_num_parts_of_part_ii / atomic_part_count) /
2499 num_partitions_in_current_dim;
2500 for(mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2501 mj_part_t num_future_parts_for_part_iii =
2502 ideal_num_future_parts_in_part;
2505 if(iii < (future_num_parts_of_part_ii / atomic_part_count) %
2506 num_partitions_in_current_dim) {
2508 ++num_future_parts_for_part_iii;
2511 next_future_num_parts_in_parts->push_back(
2512 num_future_parts_for_part_iii * atomic_part_count);
2516 if(this->mj_keep_part_boxes) {
2517 output_part_boxes->push_back((*input_part_boxes)[ii]);
2520 if(num_future_parts_for_part_iii > future_num_parts)
2521 future_num_parts = num_future_parts_for_part_iii;
2527 device_num_partitioning_in_current_dim = Kokkos::View<
2528 mj_part_t*,
device_t>(
"test", num_partitioning_in_current_dim.size());
2529 host_num_partitioning_in_current_dim =
2530 Kokkos::create_mirror_view(device_num_partitioning_in_current_dim);
2531 for(
size_t n = 0; n < num_partitioning_in_current_dim.size(); ++n) {
2532 host_num_partitioning_in_current_dim(n) =
2533 num_partitioning_in_current_dim[n];
2538 Kokkos::deep_copy(device_num_partitioning_in_current_dim,
2539 host_num_partitioning_in_current_dim);
2540 return output_num_parts;
2545 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2546 typename mj_part_t,
typename mj_node_t>
2547 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2548 allocate_set_work_memory()
2553 this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2554 Kokkos::ViewAllocateWithoutInitializing(
"coordinate_permutations"),
2555 this->num_local_coords);
2556 auto local_coordinate_permutations = coordinate_permutations;
2557 Kokkos::parallel_for(
2558 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
2559 0, this->num_local_coords), KOKKOS_LAMBDA (mj_lno_t i) {
2560 local_coordinate_permutations(i) = i;
2564 this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2565 Kokkos::ViewAllocateWithoutInitializing(
"num_local_coords"),
2566 this->num_local_coords);
2568 this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2569 Kokkos::ViewAllocateWithoutInitializing(
"assigned parts"), 0);
2570 if(this->num_local_coords > 0) {
2571 this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2572 Kokkos::ViewAllocateWithoutInitializing(
"assigned part ids"),
2573 this->num_local_coords);
2580 this->part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2581 Kokkos::ViewAllocateWithoutInitializing(
"part xadj"), 1);
2582 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
2583 host_part_xadj(0) = num_local_coords;
2584 Kokkos::deep_copy(this->part_xadj, host_part_xadj);
2587 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2588 Kokkos::ViewAllocateWithoutInitializing(
"empty"), 0);
2591 this->all_cut_coordinates = Kokkos::View<mj_scalar_t*, device_t>(
2592 Kokkos::ViewAllocateWithoutInitializing(
"all cut coordinates"),
2593 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2596 this->process_cut_line_weight_to_put_left = Kokkos::View<mj_scalar_t*,
2597 device_t>(Kokkos::ViewAllocateWithoutInitializing(
"empty"), 0);
2601 this->thread_cut_line_weight_to_put_left =
2602 Kokkos::View<mj_scalar_t*, device_t>(
2603 Kokkos::ViewAllocateWithoutInitializing(
"empty"), 0);
2605 if(this->distribute_points_on_cut_lines) {
2606 this->process_cut_line_weight_to_put_left =
2607 Kokkos::View<mj_scalar_t *, device_t>(
2608 Kokkos::ViewAllocateWithoutInitializing(
2609 "process_cut_line_weight_to_put_left"),
2610 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2611 this->thread_cut_line_weight_to_put_left =
2612 Kokkos::View<mj_scalar_t *, device_t>(
2613 Kokkos::ViewAllocateWithoutInitializing(
2614 "thread_cut_line_weight_to_put_left"),
2615 this->max_num_cut_along_dim);
2616 this->process_rectilinear_cut_weight =
2617 Kokkos::View<mj_scalar_t *, device_t>(
2618 Kokkos::ViewAllocateWithoutInitializing(
"process_rectilinear_cut_weight"),
2619 this->max_num_cut_along_dim);
2620 this->global_rectilinear_cut_weight =
2621 Kokkos::View<mj_scalar_t *, device_t>(
2622 Kokkos::ViewAllocateWithoutInitializing(
"global_rectilinear_cut_weight"),
2623 this->max_num_cut_along_dim);
2630 this->cut_coordinates_work_array =
2631 Kokkos::View<mj_scalar_t *, device_t>(
2632 Kokkos::ViewAllocateWithoutInitializing(
"cut_coordinates_work_array"),
2633 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2636 this->target_part_weights = Kokkos::View<mj_scalar_t*, device_t>(
2637 Kokkos::ViewAllocateWithoutInitializing(
"target_part_weights"),
2638 this->max_num_part_along_dim * this->max_concurrent_part_calculation);
2641 this->cut_upper_bound_coordinates =
2642 Kokkos::View<mj_scalar_t*, device_t>(
2643 Kokkos::ViewAllocateWithoutInitializing(
"cut_upper_bound_coordinates"),
2644 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2647 this->cut_lower_bound_coordinates =
2648 Kokkos::View<mj_scalar_t*, device_t>(
2649 Kokkos::ViewAllocateWithoutInitializing(
"cut_lower_bound_coordinates"),
2650 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2653 this->cut_lower_bound_weights =
2654 Kokkos::View<mj_scalar_t*, device_t>(
2655 Kokkos::ViewAllocateWithoutInitializing(
"cut_lower_bound_weights"),
2656 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2659 this->cut_upper_bound_weights =
2660 Kokkos::View<mj_scalar_t*, device_t>(
2661 Kokkos::ViewAllocateWithoutInitializing(
"cut_upper_bound_weights"),
2662 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2666 this->process_local_min_max_coord_total_weight =
2667 Kokkos::View<mj_scalar_t*, device_t>(
2668 Kokkos::ViewAllocateWithoutInitializing(
2669 "process_local_min_max_coord_total_weight"),
2670 3 * this->max_concurrent_part_calculation);
2673 this->global_min_max_coord_total_weight =
2674 Kokkos::View<mj_scalar_t*, device_t>(
2675 Kokkos::ViewAllocateWithoutInitializing(
"global_min_max_coord_total_weight"),
2676 3 * this->max_concurrent_part_calculation);
2681 this->is_cut_line_determined = Kokkos::View<bool *, device_t>(
2682 Kokkos::ViewAllocateWithoutInitializing(
"is_cut_line_determined"),
2683 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2689 this->device_incomplete_cut_count = Kokkos::View<mj_part_t *, device_t>(
2690 Kokkos::ViewAllocateWithoutInitializing(
"device_incomplete_cut_count"),
2691 this->max_concurrent_part_calculation);
2692 this->incomplete_cut_count =
2693 Kokkos::create_mirror_view(device_incomplete_cut_count);
2696 this->thread_part_weights = Kokkos::View<double *, device_t>(
2697 Kokkos::ViewAllocateWithoutInitializing(
"thread_part_weights"),
2698 this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
2700 this->thread_cut_left_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2701 Kokkos::ViewAllocateWithoutInitializing(
"thread_cut_left_closest_point"),
2702 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2706 this->thread_cut_right_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2707 Kokkos::ViewAllocateWithoutInitializing(
"thread_cut_right_closest_point"),
2708 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2711 this->thread_point_counts = Kokkos::View<mj_lno_t *, device_t>(
2712 Kokkos::ViewAllocateWithoutInitializing(
"thread_point_counts"),
2713 this->max_num_part_along_dim);
2719 this->total_part_weight_left_right_closests =
2720 Kokkos::View<mj_scalar_t*, device_t>(
2721 Kokkos::ViewAllocateWithoutInitializing(
2722 "total_part_weight_left_right_closests"),
2723 (this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) *
2724 this->max_concurrent_part_calculation);
2726 this->global_total_part_weight_left_right_closests =
2727 Kokkos::View<mj_scalar_t*, device_t>(
2728 Kokkos::ViewAllocateWithoutInitializing(
2729 "global_total_part_weight_left_right_closests"),
2730 (this->max_num_total_part_along_dim +
2731 this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2733 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
2734 Kokkos::ViewAllocateWithoutInitializing(
"gids"), num_local_coords);
2736 this->owner_of_coordinate = Kokkos::View<int *, Kokkos::HostSpace>(
2737 Kokkos::ViewAllocateWithoutInitializing(
"owner_of_coordinate"),
2743 Kokkos::deep_copy(owner_of_coordinate, myActualRank);
2745 auto local_current_mj_gnos = current_mj_gnos;
2746 auto local_initial_mj_gnos = initial_mj_gnos;
2747 Kokkos::parallel_for(
2748 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2749 (0, num_local_coords), KOKKOS_LAMBDA (mj_lno_t j) {
2750 local_current_mj_gnos(j) = local_initial_mj_gnos(j);
2756 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2757 typename mj_part_t,
typename mj_node_t>
2758 void AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,
2759 mj_node_t>::compute_global_box()
2762 mj_scalar_t *mins =
new mj_scalar_t[this->coord_dim];
2764 mj_scalar_t *gmins =
new mj_scalar_t[this->coord_dim];
2766 mj_scalar_t *maxs =
new mj_scalar_t[this->coord_dim];
2768 mj_scalar_t *gmaxs =
new mj_scalar_t[this->coord_dim];
2770 auto local_mj_coordinates = this->mj_coordinates;
2774 for(
int i = 0; i < this->coord_dim; ++i) {
2779 for(
int i = 0; i < std::min(this->recursion_depth, this->coord_dim); ++i) {
2780 Kokkos::parallel_reduce(
"MinReduce",
2781 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2782 (0, this->num_local_coords),
2783 KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_min) {
2784 if(local_mj_coordinates(j,i) < running_min) {
2785 running_min = local_mj_coordinates(j,i);
2787 }, Kokkos::Min<mj_scalar_t>(mins[i]));
2788 Kokkos::parallel_reduce(
"MaxReduce",
2789 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2790 (0, this->num_local_coords),
2791 KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_max) {
2792 if(local_mj_coordinates(j,i) > running_max) {
2793 running_max = local_mj_coordinates(j,i);
2795 }, Kokkos::Max<mj_scalar_t>(maxs[i]));
2798 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
2799 this->coord_dim, mins, gmins
2802 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
2803 this->coord_dim, maxs, gmaxs
2807 global_box = rcp(
new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
2821 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2822 typename mj_part_t,
typename mj_node_t>
2823 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2824 mj_node_t>::init_part_boxes(
2825 RCP<mj_partBoxVector_t> & initial_partitioning_boxes)
2827 mj_partBox_t tmp_box(*global_box);
2828 initial_partitioning_boxes->push_back(tmp_box);
2835 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2838 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2839 mj_get_local_min_max_coord_totW(
2840 mj_part_t current_work_part,
2841 mj_part_t current_concurrent_num_parts,
2842 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords)
2844 auto local_coordinate_permutations = this->coordinate_permutations;
2845 auto local_process_local_min_max_coord_total_weight =
2846 this->process_local_min_max_coord_total_weight;
2847 auto local_mj_weights = this->mj_weights;
2849 bool bUniformWeights = mj_uniform_weights(0);
2851 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
2853 mj_part_t concurrent_current_part = current_work_part + kk;
2854 mj_lno_t coordinate_begin_index = concurrent_current_part == 0 ? 0 :
2855 host_part_xadj(concurrent_current_part - 1);
2856 mj_lno_t coordinate_end_index =
2857 host_part_xadj(concurrent_current_part);
2859 mj_scalar_t my_min_coord = 0;
2860 mj_scalar_t my_max_coord = 0;
2861 mj_scalar_t my_total_weight;
2864 if(coordinate_begin_index >= coordinate_end_index)
2866 my_min_coord = std::numeric_limits<mj_scalar_t>::max();
2867 my_max_coord = -std::numeric_limits<mj_scalar_t>::max();
2868 my_total_weight = 0;
2872 Kokkos::parallel_reduce(
"get min",
2873 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2874 (coordinate_begin_index, coordinate_end_index),
2875 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_min) {
2876 int i = local_coordinate_permutations(j);
2877 if(mj_current_dim_coords(i) < running_min)
2878 running_min = mj_current_dim_coords(i);
2879 }, Kokkos::Min<mj_scalar_t>(my_min_coord));
2881 Kokkos::parallel_reduce(
"get max",
2882 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2883 (coordinate_begin_index, coordinate_end_index),
2884 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_max) {
2885 int i = local_coordinate_permutations(j);
2886 if(mj_current_dim_coords(i) > running_max)
2887 running_max = mj_current_dim_coords(i);
2888 }, Kokkos::Max<mj_scalar_t>(my_max_coord));
2889 if(bUniformWeights) {
2890 my_total_weight = coordinate_end_index - coordinate_begin_index;
2893 my_total_weight = 0;
2894 Kokkos::parallel_reduce(
"get weight",
2895 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2896 (coordinate_begin_index, coordinate_end_index),
2897 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & lsum) {
2898 int i = local_coordinate_permutations(j);
2899 lsum += local_mj_weights(i,0);
2900 }, my_total_weight);
2905 Kokkos::parallel_for(
2906 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
2907 (0, 1), KOKKOS_LAMBDA (
int dummy) {
2908 local_process_local_min_max_coord_total_weight(kk) =
2910 local_process_local_min_max_coord_total_weight(
2911 kk + current_concurrent_num_parts) = my_max_coord;
2912 local_process_local_min_max_coord_total_weight(
2913 kk + 2*current_concurrent_num_parts) = my_total_weight;
2930 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2931 typename mj_part_t,
typename mj_node_t>
2932 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2933 mj_node_t>::mj_get_global_min_max_coord_totW(
2934 mj_part_t current_concurrent_num_parts,
2935 Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
2936 Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total) {
2940 if(this->comm->getSize() > 1) {
2943 auto host_local_min_max_total =
2944 Kokkos::create_mirror_view(Kokkos::HostSpace(), local_min_max_total);
2945 auto host_global_min_max_total =
2946 Kokkos::create_mirror_view(Kokkos::HostSpace(), global_min_max_total);
2947 Kokkos::deep_copy(host_local_min_max_total, local_min_max_total);
2949 reductionOp(current_concurrent_num_parts,
2950 current_concurrent_num_parts, current_concurrent_num_parts);
2952 reduceAll<int, mj_scalar_t>(
2955 3 * current_concurrent_num_parts,
2956 host_local_min_max_total.data(),
2957 host_global_min_max_total.data());
2960 Kokkos::deep_copy(global_min_max_total, host_global_min_max_total);
2963 mj_part_t s = 3 * current_concurrent_num_parts;
2964 Kokkos::parallel_for(
2965 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
2966 (0, s), KOKKOS_LAMBDA (mj_part_t i) {
2967 global_min_max_total(i) = local_min_max_total(i);
3004 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3005 typename mj_part_t,
typename mj_node_t>
3006 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
3007 mj_get_initial_cut_coords_target_weights(
3008 mj_scalar_t min_coord,
3009 mj_scalar_t max_coord,
3010 mj_part_t num_cuts ,
3011 mj_scalar_t global_weight,
3013 Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
3015 Kokkos::View<mj_scalar_t *, device_t> & current_target_part_weights ,
3016 std::vector <mj_part_t> *future_num_part_in_parts,
3017 std::vector <mj_part_t> *next_future_num_parts_in_parts,
3018 mj_part_t concurrent_current_part,
3019 mj_part_t obtained_part_index,
3020 mj_part_t num_target_first_level_parts,
3021 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist)
3023 mj_scalar_t coord_range = max_coord - min_coord;
3030 bool bUniformPartsCheck =
3031 num_target_first_level_parts <= 1 && this->mj_uniform_parts(0);
3033 if(!bUniformPartsCheck) {
3034 bool bValidNonUniformTargetWeights =
3035 (num_target_first_level_parts > 1 && target_first_level_dist.size() != 0);
3036 if(!bValidNonUniformTargetWeights) {
3037 std::cerr <<
"MJ does not support non uniform part weights beyond the first partition" << std::endl;
3042 Kokkos::View<mj_scalar_t*, device_t> device_cumulative(
3043 "device_cumulative", num_cuts);
3044 auto host_cumulative = Kokkos::create_mirror_view(device_cumulative);
3046 mj_scalar_t cumulative = 0;
3048 if(bUniformPartsCheck) {
3050 mj_scalar_t total_future_part_count_in_part =
3051 static_cast<mj_scalar_t
>((*future_num_part_in_parts)[concurrent_current_part]);
3054 mj_scalar_t unit_part_weight =
3055 global_weight / total_future_part_count_in_part;
3057 for(mj_part_t i = 0; i < num_cuts; ++i) {
3058 cumulative += unit_part_weight *
static_cast<mj_scalar_t
>((*next_future_num_parts_in_parts)[i + obtained_part_index]);
3059 host_cumulative(i) = cumulative;
3064 mj_scalar_t sum_target_first_level_dist = 0.0;
3065 for (
int i = 0; i < num_target_first_level_parts; ++i) {
3066 sum_target_first_level_dist += target_first_level_dist(i);
3069 for(mj_part_t i = 0; i < num_cuts; ++i) {
3070 cumulative += global_weight * target_first_level_dist(i) /
3071 sum_target_first_level_dist;
3072 host_cumulative(i) = cumulative;
3076 Kokkos::deep_copy(device_cumulative, host_cumulative);
3078 Kokkos::parallel_for(
"Write num in parts",
3079 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3080 (0, num_cuts), KOKKOS_LAMBDA(mj_part_t cut) {
3082 current_target_part_weights(cut) = device_cumulative(cut);
3083 initial_cut_coords(cut) = min_coord +
3084 (coord_range * device_cumulative(cut)) / global_weight;
3086 current_target_part_weights(num_cuts) = global_weight;
3092 if (!bUniformPartsCheck || this->mj_uniform_weights[0]) {
3093 Kokkos::parallel_for(
3094 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3096 KOKKOS_LAMBDA (mj_part_t i) {
3097 current_target_part_weights(i) =
3098 long(current_target_part_weights(i) + 0.5);
3119 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3120 typename mj_part_t,
typename mj_node_t>
3121 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
3122 set_initial_coordinate_parts(
3123 mj_scalar_t &max_coordinate,
3124 mj_scalar_t &min_coordinate,
3125 mj_lno_t coordinate_begin_index,
3126 mj_lno_t coordinate_end_index,
3127 Kokkos::View<mj_lno_t *, device_t> & mj_current_coordinate_permutations,
3128 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3129 Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
3130 mj_part_t &partition_count)
3132 mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
3136 if(std::abs(coordinate_range) < this->sEpsilon ) {
3137 Kokkos::parallel_for(
3138 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3139 (coordinate_begin_index, coordinate_end_index),
3140 KOKKOS_LAMBDA (mj_lno_t ii) {
3141 mj_part_ids(mj_current_coordinate_permutations[ii]) = 0;
3147 mj_scalar_t slice = coordinate_range / partition_count;
3148 Kokkos::parallel_for(
3149 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3150 (coordinate_begin_index, coordinate_end_index),
3151 KOKKOS_LAMBDA (mj_lno_t ii) {
3152 mj_lno_t iii = mj_current_coordinate_permutations[ii];
3154 mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
3155 if(pp >= partition_count) {
3156 pp = partition_count - 1;
3158 mj_part_ids[iii] = 2 * pp;
3177 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3178 typename mj_part_t,
typename mj_node_t>
3179 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,mj_node_t>::mj_1D_part(
3180 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3181 double used_imbalance_tolerance,
3182 mj_part_t current_work_part,
3183 mj_part_t current_concurrent_num_parts,
3184 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
3185 mj_part_t total_incomplete_cut_count,
3186 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
3187 Kokkos::View<size_t*, device_t> & view_total_reduction_size)
3189 this->temp_cut_coords = current_cut_coordinates;
3192 *reductionOp = NULL;
3194 bool bSingleProcess = (this->comm->getSize() == 1);
3196 std::vector<mj_part_t> temp(host_num_partitioning_in_current_dim.size());
3197 if(!bSingleProcess) {
3198 for(
size_t n = 0; n < host_num_partitioning_in_current_dim.size(); ++n) {
3199 temp[n] = host_num_partitioning_in_current_dim(n);
3202 <mj_part_t, mj_scalar_t>(
3205 current_concurrent_num_parts);
3208 auto local_cut_lower_bound_coordinates =
3209 cut_lower_bound_coordinates;
3210 auto local_cut_upper_bound_coordinates =
3211 cut_upper_bound_coordinates;
3212 auto local_cut_upper_bound_weights = cut_upper_bound_weights;
3213 auto local_cut_lower_bound_weights = cut_lower_bound_weights;
3214 bool local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
3215 auto local_process_cut_line_weight_to_put_left =
3216 process_cut_line_weight_to_put_left;
3217 auto local_temp_cut_coords = temp_cut_coords;
3218 auto local_global_total_part_weight_left_right_closests =
3219 global_total_part_weight_left_right_closests;
3220 auto local_cut_coordinates_work_array =
3221 cut_coordinates_work_array;
3222 auto local_part_xadj = part_xadj;
3223 auto local_global_min_max_coord_total_weight =
3224 global_min_max_coord_total_weight;
3225 auto local_target_part_weights =
3226 target_part_weights;
3227 auto local_global_rectilinear_cut_weight =
3228 global_rectilinear_cut_weight;
3229 auto local_process_rectilinear_cut_weight =
3230 process_rectilinear_cut_weight;
3232 auto local_is_cut_line_determined = this->is_cut_line_determined;
3233 auto local_device_num_partitioning_in_current_dim =
3234 device_num_partitioning_in_current_dim;
3236 Kokkos::parallel_for(
3237 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3238 KOKKOS_LAMBDA (
int dummy) {
3241 view_rectilinear_cut_count(0) = 0;
3242 view_total_reduction_size(0) = 0;
3246 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3247 mj_part_t num_part_in_dim =
3248 local_device_num_partitioning_in_current_dim(current_work_part + i);
3249 mj_part_t num_cut_in_dim = num_part_in_dim - 1;
3250 view_total_reduction_size(0) += (4 * num_cut_in_dim + 1);
3252 for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii) {
3253 local_is_cut_line_determined(next) =
false;
3255 local_cut_lower_bound_coordinates(next) =
3256 local_global_min_max_coord_total_weight(i);
3258 local_cut_upper_bound_coordinates(next) =
3259 local_global_min_max_coord_total_weight(
3260 i + current_concurrent_num_parts);
3262 local_cut_upper_bound_weights(next) =
3263 local_global_min_max_coord_total_weight(
3264 i + 2 * current_concurrent_num_parts);
3265 local_cut_lower_bound_weights(next) = 0;
3266 if(local_distribute_points_on_cut_lines) {
3267 local_process_cut_line_weight_to_put_left(next) = 0;
3278 while (total_incomplete_cut_count != 0) {
3279 this->mj_1D_part_get_part_weights(
3280 current_concurrent_num_parts,
3282 mj_current_dim_coords,
3286 this->mj_combine_rightleft_and_weights(
3288 current_concurrent_num_parts);
3291 if(!bSingleProcess) {
3294 auto host_total_part_weight_left_right_closests =
3295 Kokkos::create_mirror_view(Kokkos::HostSpace(),
3296 total_part_weight_left_right_closests);
3297 auto host_global_total_part_weight_left_right_closests =
3298 Kokkos::create_mirror_view(Kokkos::HostSpace(),
3299 global_total_part_weight_left_right_closests);
3301 Kokkos::deep_copy(host_total_part_weight_left_right_closests,
3302 total_part_weight_left_right_closests);
3304 size_t host_view_total_reduction_size;
3305 Kokkos::parallel_reduce(
"Read single",
3306 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3307 KOKKOS_LAMBDA(
int dummy,
size_t & set_single) {
3308 set_single = view_total_reduction_size(0);
3309 }, host_view_total_reduction_size);
3311 reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
3312 host_view_total_reduction_size,
3313 host_total_part_weight_left_right_closests.data(),
3314 host_global_total_part_weight_left_right_closests.data());
3315 Kokkos::deep_copy(global_total_part_weight_left_right_closests,
3316 host_global_total_part_weight_left_right_closests);
3319 local_global_total_part_weight_left_right_closests =
3320 this->total_part_weight_left_right_closests;
3325 mj_part_t cut_shift = 0;
3329 size_t tlr_shift = 0;
3331 Kokkos::View<mj_part_t*, Kokkos::HostSpace>
3332 save_initial_incomplete_cut_count(
"save_initial_incomplete_cut_count",
3333 current_concurrent_num_parts);
3335 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3337 mj_part_t num_parts =
3338 host_num_partitioning_in_current_dim(current_work_part + kk);
3340 mj_part_t num_cuts = num_parts - 1;
3341 size_t num_total_part = num_parts + size_t (num_cuts);
3346 mj_part_t kk_incomplete_cut_count = this->incomplete_cut_count(kk);
3348 if(kk_incomplete_cut_count == 0) {
3349 cut_shift += num_cuts;
3350 tlr_shift += (num_total_part + 2 * num_cuts);
3354 Kokkos::View<mj_scalar_t *, device_t> current_local_part_weights =
3355 Kokkos::subview(this->total_part_weight_left_right_closests,
3356 std::pair<mj_lno_t, mj_lno_t>(
3358 this->total_part_weight_left_right_closests.size()));
3360 Kokkos::View<mj_scalar_t *, device_t> current_global_tlr =
3362 local_global_total_part_weight_left_right_closests,
3363 std::pair<mj_lno_t, mj_lno_t>(
3365 local_global_total_part_weight_left_right_closests.size()));
3366 Kokkos::View<mj_scalar_t *, device_t>
3367 current_global_left_closest_points =
3368 Kokkos::subview(current_global_tlr,
3369 std::pair<mj_lno_t, mj_lno_t>(
3371 current_global_tlr.size()));
3372 Kokkos::View<mj_scalar_t *, device_t>
3373 current_global_right_closest_points =
3374 Kokkos::subview(current_global_tlr,
3375 std::pair<mj_lno_t, mj_lno_t>(
3376 num_total_part + num_cuts,
3377 current_global_tlr.size()));
3378 Kokkos::View<mj_scalar_t *, device_t> current_global_part_weights =
3381 Kokkos::View<bool *, device_t> current_cut_line_determined =
3382 Kokkos::subview(this->is_cut_line_determined,
3383 std::pair<mj_lno_t, mj_lno_t>(
3385 this->is_cut_line_determined.size()));
3386 Kokkos::View<mj_scalar_t *, device_t> current_part_target_weights =
3387 Kokkos::subview(local_target_part_weights,
3388 std::pair<mj_lno_t, mj_lno_t>(
3390 local_target_part_weights.size()));
3391 Kokkos::View<mj_scalar_t *, device_t>
3392 current_part_cut_line_weight_to_put_left =
3393 Kokkos::subview(local_process_cut_line_weight_to_put_left,
3394 std::pair<mj_lno_t, mj_lno_t>(
3396 local_process_cut_line_weight_to_put_left.size()));
3398 save_initial_incomplete_cut_count(kk) =
3399 kk_incomplete_cut_count;
3401 Kokkos::View<mj_scalar_t *, device_t>
3402 current_cut_lower_bound_weights =
3403 Kokkos::subview(local_cut_lower_bound_weights,
3404 std::pair<mj_lno_t, mj_lno_t>(
3406 local_cut_lower_bound_weights.size()));
3407 Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_weights =
3408 Kokkos::subview(local_cut_upper_bound_weights,
3409 std::pair<mj_lno_t, mj_lno_t>(
3411 local_cut_upper_bound_weights.size()));
3412 Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_bounds =
3413 Kokkos::subview(local_cut_upper_bound_coordinates,
3414 std::pair<mj_lno_t, mj_lno_t>(
3416 local_cut_upper_bound_coordinates.size()));
3417 Kokkos::View<mj_scalar_t *, device_t> current_cut_lower_bounds =
3418 Kokkos::subview(local_cut_lower_bound_coordinates,
3419 std::pair<mj_lno_t, mj_lno_t>(
3421 local_cut_lower_bound_coordinates.size()));
3424 Kokkos::View<mj_scalar_t*, device_t> sub_temp_cut_coords =
3425 Kokkos::subview(this->temp_cut_coords,
3426 std::pair<mj_lno_t, mj_lno_t>(
3427 cut_shift, this->temp_cut_coords.size()));
3428 Kokkos::View<mj_scalar_t*, device_t> sub_cut_coordinates_work_array =
3429 Kokkos::subview(this->cut_coordinates_work_array,
3430 std::pair<mj_lno_t, mj_lno_t>(
3431 cut_shift, this->cut_coordinates_work_array.size()));
3433 this->mj_get_new_cut_coordinates(
3434 current_concurrent_num_parts,
3437 used_imbalance_tolerance,
3438 current_global_part_weights,
3439 current_local_part_weights,
3440 current_part_target_weights,
3441 current_cut_line_determined,
3442 sub_temp_cut_coords,
3443 current_cut_upper_bounds,
3444 current_cut_lower_bounds,
3445 current_global_left_closest_points,
3446 current_global_right_closest_points,
3447 current_cut_lower_bound_weights,
3448 current_cut_upper_weights,
3449 sub_cut_coordinates_work_array,
3450 current_part_cut_line_weight_to_put_left,
3451 view_rectilinear_cut_count);
3453 cut_shift += num_cuts;
3454 tlr_shift += (num_total_part + 2 * num_cuts);
3457 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3458 mj_part_t iteration_complete_cut_count =
3459 save_initial_incomplete_cut_count(kk) - this->incomplete_cut_count(kk);
3460 total_incomplete_cut_count -= iteration_complete_cut_count;
3463 Kokkos::parallel_for(
3464 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3465 (0, local_temp_cut_coords.size()), KOKKOS_LAMBDA(
int n) {
3466 auto t = local_temp_cut_coords(n);
3467 local_temp_cut_coords(n) = local_cut_coordinates_work_array(n);
3468 local_cut_coordinates_work_array(n) = t;
3476 if(current_cut_coordinates != local_temp_cut_coords) {
3477 Kokkos::parallel_for(
3478 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3479 (0, 1), KOKKOS_LAMBDA(
int dummy) {
3481 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3482 mj_part_t num_parts = -1;
3483 num_parts = local_device_num_partitioning_in_current_dim(
3484 current_work_part + i);
3485 mj_part_t num_cuts = num_parts - 1;
3486 for(mj_part_t ii = 0; ii < num_cuts; ++ii) {
3487 current_cut_coordinates(next + ii) = local_temp_cut_coords(next + ii);
3492 static_cast<int>(local_cut_coordinates_work_array.size()); ++n) {
3493 local_cut_coordinates_work_array(n) = local_temp_cut_coords(n);
3501 template<
class scalar_t>
3507 KOKKOS_INLINE_FUNCTION
3510 KOKKOS_INLINE_FUNCTION
3514 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3516 template<
class policy_t,
class scalar_t,
class part_t>
3527 scalar_t mj_max_scalar,
3529 int mj_value_count_rightleft,
3530 int mj_value_count_weights) :
3537 KOKKOS_INLINE_FUNCTION
3542 KOKKOS_INLINE_FUNCTION
3545 dst.
ptr[n] += src.
ptr[n];
3548 for(
int n = value_count_weights + 2;
3550 if(src.
ptr[n] > dst.
ptr[n]) {
3553 if(src.
ptr[n+1] < dst.
ptr[n+1]) {
3554 dst.
ptr[n+1] = src.
ptr[n+1];
3559 KOKKOS_INLINE_FUNCTION
3562 dst.
ptr[n] += src.
ptr[n];
3565 for(
int n = value_count_weights + 2;
3567 if(src.
ptr[n] > dst.
ptr[n]) {
3570 if(src.
ptr[n+1] < dst.
ptr[n+1]) {
3571 dst.
ptr[n+1] = src.
ptr[n+1];
3583 for(
int n = value_count_weights;
3590 #endif // KOKKOS_ENABLE_CUDA && KOKKOS_ENABLE_HIP
3592 template<
class policy_t,
class scalar_t,
class part_t,
class index_t,
3598 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3621 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3622 Kokkos::View<double *, device_t> current_part_weights;
3623 Kokkos::View<scalar_t *, device_t> current_left_closest;
3624 Kokkos::View<scalar_t *, device_t> current_right_closest;
3625 #endif // KOKKOS_ENABLE_CUDA || defined(KOKKOS_ENABLE_HIP)
3629 array_t mj_max_scalar,
3630 part_t mj_concurrent_current_part,
3632 part_t mj_current_work_part,
3633 part_t mj_current_concurrent_num_parts,
3634 part_t mj_left_right_array_size,
3635 part_t mj_weight_array_size,
3636 Kokkos::View<index_t*, device_t> & mj_permutations,
3637 Kokkos::View<scalar_t *, device_t> & mj_coordinates,
3638 Kokkos::View<scalar_t**, device_t> & mj_weights,
3639 Kokkos::View<part_t*, device_t> & mj_parts,
3640 Kokkos::View<scalar_t *, device_t> & mj_cut_coordinates,
3641 Kokkos::View<index_t *, device_t> & mj_part_xadj,
3642 bool mj_uniform_weights0,
3643 scalar_t mj_sEpsilon
3644 #
if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3645 ,Kokkos::View<double *, device_t> & mj_current_part_weights,
3646 Kokkos::View<scalar_t *, device_t> & mj_current_left_closest,
3647 Kokkos::View<scalar_t *, device_t> & mj_current_right_closest
3650 loop_count(mj_loop_count),
3652 concurrent_current_part(mj_concurrent_current_part),
3653 num_cuts(mj_num_cuts),
3654 current_work_part(mj_current_work_part),
3655 current_concurrent_num_parts(mj_current_concurrent_num_parts),
3658 value_count(mj_weight_array_size+mj_left_right_array_size),
3667 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3668 ,current_part_weights(mj_current_part_weights),
3669 current_left_closest(mj_current_left_closest),
3670 current_right_closest(mj_current_right_closest)
3676 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3677 int result =
sizeof(array_t) *
3680 int result =
sizeof(array_t) *
3685 int remainder = result % 8;
3686 if(remainder != 0) {
3687 result += 8 - remainder;
3692 KOKKOS_INLINE_FUNCTION
3693 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3699 index_t all_begin = (concurrent_current_part == 0) ? 0 :
3701 index_t all_end =
part_xadj(concurrent_current_part);
3703 index_t num_working_points = all_end - all_begin;
3704 int num_teams = teamMember.league_size();
3706 index_t stride = num_working_points / num_teams;
3707 if((num_working_points % num_teams) > 0) {
3714 index_t begin = all_begin + stride * teamMember.league_rank();
3715 index_t end = begin + stride;
3720 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3724 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3728 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
3732 for(
int n = value_count_weights;
3738 teamMember.team_barrier();
3740 Kokkos::parallel_for(
3741 Kokkos::TeamThreadRange(teamMember, begin, end),
3743 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3748 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3761 Kokkos::parallel_reduce(
3762 Kokkos::TeamThreadRange(teamMember, begin, end),
3763 #
if (__cplusplus > 201703L)
3768 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3775 index_t part =
parts(i)/2;
3786 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3787 Kokkos::atomic_add(&shared_ptr[part*2], w);
3789 threadSum.
ptr[part*2] += w;
3795 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3796 array_t new_value = (array_t) coord;
3798 while(new_value < prev_value) {
3799 prev_value = Kokkos::atomic_compare_exchange(
3801 prev_value, new_value);
3804 while(new_value > prev_value) {
3805 prev_value = Kokkos::atomic_compare_exchange(
3807 prev_value, new_value);
3824 else if(part != num_cuts) {
3825 if(coord < b + sEpsilon && coord > b -
sEpsilon) {
3829 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3830 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3834 threadSum.
ptr[part*2+1] += w;
3839 parts(i) = part*2+1;
3845 part_t base_b = part;
3848 while(part < num_cuts) {
3850 scalar_t delta = b - base_coord;
3851 if(delta < 0) delta = -delta;
3856 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3857 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3861 threadSum.
ptr[part*2+1] += w;
3872 scalar_t delta = b - base_coord;
3873 if(delta < 0) delta = -delta;
3878 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3879 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3883 threadSum.
ptr[part*2+1] += w;
3896 if(loop_count != 0) {
3908 if(part == lower + 1) {
3913 part -= (part - lower)/2;
3916 else if(part == upper - 1) {
3921 part += (upper - part)/2;
3925 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3927 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3928 }, arraySumReducer);
3929 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3931 teamMember.team_barrier();
3934 #if (__cplusplus > 201703L)
3935 Kokkos::single(Kokkos::PerTeam(teamMember), [=,
this] () {
3937 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
3940 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3941 Kokkos::atomic_add(¤t_part_weights(n),
3942 static_cast<double>(shared_ptr[n]));
3943 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3944 teamSum[n] += array.
ptr[n];
3945 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3948 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3949 int insert_left = 0;
3950 int insert_right = 0;
3953 for(
int n = 2 + value_count_weights;
3955 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3956 scalar_t new_value = shared_ptr[n+1];
3957 scalar_t prev_value = current_right_closest(insert_right);
3958 while(new_value < prev_value) {
3959 prev_value = Kokkos::atomic_compare_exchange(
3960 ¤t_right_closest(insert_right), prev_value, new_value);
3963 new_value = shared_ptr[n];
3964 prev_value = current_left_closest(insert_left);
3965 while(new_value > prev_value) {
3966 prev_value = Kokkos::atomic_compare_exchange(
3967 ¤t_left_closest(insert_left), prev_value, new_value);
3972 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3973 if(array.
ptr[n] > teamSum[n]) {
3974 teamSum[n] = array.
ptr[n];
3976 if(array.
ptr[n+1] < teamSum[n+1]) {
3977 teamSum[n+1] = array.
ptr[n+1];
3979 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3983 teamMember.team_barrier();
3986 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3987 KOKKOS_INLINE_FUNCTION
3993 for(
int n = value_count_weights + 2;
3995 if(src[n] > dst[n]) {
3998 if(src[n+1] < dst[n+1]) {
3999 dst[n+1] = src[n+1];
4009 for(
int n = value_count_weights;
4015 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4025 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4026 typename mj_part_t,
typename mj_node_t>
4027 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t,mj_part_t, mj_node_t>::
4028 mj_1D_part_get_part_weights(
4029 mj_part_t current_concurrent_num_parts,
4030 mj_part_t current_work_part,
4031 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
4034 auto local_is_cut_line_determined = is_cut_line_determined;
4035 auto local_thread_part_weights = thread_part_weights;
4036 auto local_thread_cut_left_closest_point = thread_cut_left_closest_point;
4037 auto local_thread_cut_right_closest_point = thread_cut_right_closest_point;
4041 auto local_sEpsilon = this->
sEpsilon;
4042 auto local_assigned_part_ids = this->assigned_part_ids;
4043 auto local_coordinate_permutations = this->coordinate_permutations;
4044 auto local_mj_weights = this->mj_weights;
4046 auto local_global_min_max_coord_total_weight =
4047 this->global_min_max_coord_total_weight;
4049 typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;
4051 auto local_device_num_partitioning_in_current_dim =
4052 device_num_partitioning_in_current_dim;
4054 Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);
4055 auto local_device_incomplete_cut_count = device_incomplete_cut_count;
4057 mj_part_t total_part_shift = 0;
4059 mj_part_t concurrent_cut_shifts = 0;
4061 Kokkos::View<mj_scalar_t *, device_t> local_temp_cut_coords =
4062 Kokkos::subview(temp_cut_coords, std::pair<mj_lno_t, mj_lno_t>(
4063 concurrent_cut_shifts, temp_cut_coords.size()));
4065 mj_part_t num_parts =
4066 host_num_partitioning_in_current_dim(current_work_part + kk);
4067 mj_part_t num_cuts = num_parts - 1;
4068 mj_part_t total_part_count = num_parts +
num_cuts;
4069 mj_part_t weight_array_length = num_cuts + num_parts;
4072 mj_part_t right_left_array_length = (num_cuts + 2) * 2;
4074 if(this->incomplete_cut_count(kk) == 0) {
4075 total_part_shift += total_part_count;
4081 auto policy_ReduceWeightsFunctor = policy_t(
4082 mj_num_teams ? mj_num_teams : 60, Kokkos::AUTO);
4084 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4085 int total_array_length =
4086 weight_array_length + right_left_array_length;
4093 typedef mj_scalar_t array_t;
4095 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4096 Kokkos::View<array_t*, Kokkos::HostSpace> reduce_array(
"reduce_array", total_array_length);
4097 #endif // KOKKOS_ENABLE_CUDA && KOKKOS_ENABLE_HIP
4099 int offset_cuts = 0;
4100 for(
int kk2 = 0; kk2 < kk; ++kk2) {
4102 host_num_partitioning_in_current_dim(current_work_part + kk2) - 1;
4104 Kokkos::View<double *, device_t> my_current_part_weights =
4105 Kokkos::subview(local_thread_part_weights,
4106 std::pair<mj_lno_t, mj_lno_t>(total_part_shift,
4107 total_part_shift + total_part_count));
4108 Kokkos::View<mj_scalar_t *, device_t> my_current_left_closest =
4109 Kokkos::subview(local_thread_cut_left_closest_point,
4110 std::pair<mj_lno_t, mj_lno_t>(
4112 local_thread_cut_left_closest_point.size()));
4113 Kokkos::View<mj_scalar_t *, device_t> my_current_right_closest =
4114 Kokkos::subview(local_thread_cut_right_closest_point,
4115 std::pair<mj_lno_t, mj_lno_t>(
4117 local_thread_cut_right_closest_point.size()));
4119 array_t
max_scalar = std::numeric_limits<array_t>::max();
4121 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4123 Kokkos::parallel_for(
4124 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4125 KOKKOS_LAMBDA (
int dummy) {
4126 for(
int n = 0; n < weight_array_length; ++n) {
4127 my_current_part_weights(n) = 0;
4129 for(
int n = 0; n <
num_cuts; ++n) {
4136 mj_part_t concurrent_current_part =
4137 current_work_part + kk;
4140 typename mj_node_t::device_type, array_t>
4144 concurrent_current_part,
4147 current_concurrent_num_parts,
4148 right_left_array_length,
4149 weight_array_length,
4150 coordinate_permutations,
4151 mj_current_dim_coords,
4154 local_temp_cut_coords,
4156 mj_uniform_weights(0),
4158 #
if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4159 ,my_current_part_weights,
4160 my_current_left_closest,
4161 my_current_right_closest
4165 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4166 Kokkos::parallel_for(policy_ReduceWeightsFunctor, teamFunctor);
4168 Kokkos::parallel_reduce(policy_ReduceWeightsFunctor,
4169 teamFunctor, reduce_array);
4173 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4174 auto hostArray = Kokkos::create_mirror_view(my_current_part_weights);
4176 for(
int i = 0; i < static_cast<int>(total_part_count); ++i) {
4177 hostArray(i) = reduce_array[i];
4180 Kokkos::deep_copy(my_current_part_weights, hostArray);
4182 auto hostLeftArray = Kokkos::create_mirror_view(my_current_left_closest);
4183 auto hostRightArray = Kokkos::create_mirror_view(my_current_right_closest);
4184 for(mj_part_t cut = 0; cut <
num_cuts; ++cut) {
4185 hostLeftArray(cut) = reduce_array[weight_array_length + (cut+1)*2+0];
4186 hostRightArray(cut) = reduce_array[weight_array_length + (cut+1)*2+1];
4188 Kokkos::deep_copy(my_current_left_closest, hostLeftArray);
4189 Kokkos::deep_copy(my_current_right_closest, hostRightArray);
4192 total_part_shift += total_part_count;
4196 auto local_temp_cut_coords = temp_cut_coords;
4198 Kokkos::parallel_for(
4199 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
4200 (0, current_concurrent_num_parts), KOKKOS_LAMBDA(mj_part_t kk) {
4201 mj_part_t num_parts = local_device_num_partitioning_in_current_dim(
4202 current_work_part + kk);
4203 mj_part_t num_cuts = num_parts - 1;
4204 mj_part_t total_part_count = num_parts +
num_cuts;
4206 if(local_device_incomplete_cut_count(kk) > 0) {
4210 size_t offset_cuts = 0;
4211 for(mj_part_t kk2 = 0; kk2 < kk; ++kk2) {
4212 auto num_parts_kk2 = local_device_num_partitioning_in_current_dim(
4213 current_work_part + kk2);
4214 offset += num_parts_kk2 * 2 - 1;
4215 offset_cuts += num_parts_kk2 - 1;
4218 for(mj_part_t i = 1; i < total_part_count; ++i) {
4222 if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
4223 std::abs(local_temp_cut_coords(offset_cuts + i / 2) -
4224 local_temp_cut_coords(offset_cuts + i /2 - 1))
4229 local_thread_part_weights(offset + i)
4230 = local_thread_part_weights(offset + i-2);
4235 local_thread_part_weights(offset + i) +=
4236 local_thread_part_weights(offset + i-1);
4249 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4250 typename mj_part_t,
typename mj_node_t>
4251 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4252 mj_combine_rightleft_and_weights(
4253 mj_part_t current_work_part,
4254 mj_part_t current_concurrent_num_parts)
4256 auto local_thread_part_weights = this->thread_part_weights;
4257 auto local_is_cut_line_determined = this->is_cut_line_determined;
4258 auto local_thread_cut_left_closest_point =
4259 this->thread_cut_left_closest_point;
4260 auto local_thread_cut_right_closest_point =
4261 this->thread_cut_right_closest_point;
4262 auto local_total_part_weight_left_right_closests =
4263 this->total_part_weight_left_right_closests;
4264 auto local_device_num_partitioning_in_current_dim =
4265 device_num_partitioning_in_current_dim;
4266 Kokkos::parallel_for(
4267 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0,1),
4268 KOKKOS_LAMBDA (
int dummy) {
4270 size_t tlr_array_shift = 0;
4271 mj_part_t cut_shift = 0;
4272 size_t total_part_array_shift = 0;
4278 mj_part_t num_parts_in_part =
4279 local_device_num_partitioning_in_current_dim(current_work_part + i);
4280 mj_part_t num_cuts_in_part = num_parts_in_part - 1;
4281 size_t num_total_part_in_part =
4282 num_parts_in_part + size_t (num_cuts_in_part);
4285 for(
int ii = 0; ii < num_cuts_in_part; ++ii) {
4286 mj_part_t next = tlr_array_shift + ii;
4287 mj_part_t cut_index = cut_shift + ii;
4289 if(!local_is_cut_line_determined(cut_index)) {
4290 mj_scalar_t left_closest_in_process =
4291 local_thread_cut_left_closest_point(cut_index);
4292 mj_scalar_t right_closest_in_process =
4293 local_thread_cut_right_closest_point(cut_index);
4296 local_total_part_weight_left_right_closests(
4297 num_total_part_in_part + next) = left_closest_in_process;
4299 local_total_part_weight_left_right_closests(
4300 num_total_part_in_part + num_cuts_in_part + next) =
4301 right_closest_in_process;
4305 for(
size_t j = 0; j < num_total_part_in_part; ++j) {
4306 mj_part_t cut_ind = j / 2 + cut_shift;
4312 if(j == num_total_part_in_part - 1 ||
4313 !local_is_cut_line_determined(cut_ind)) {
4314 double pwj = local_thread_part_weights(total_part_array_shift + j);
4315 local_total_part_weight_left_right_closests(tlr_array_shift + j) = pwj;
4320 cut_shift += num_cuts_in_part;
4321 tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
4322 total_part_array_shift += num_total_part_in_part;
4339 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4340 typename mj_part_t,
typename mj_node_t>
4341 KOKKOS_INLINE_FUNCTION
4342 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
4343 mj_node_t>::mj_calculate_new_cut_position(mj_scalar_t cut_upper_bound,
4344 mj_scalar_t cut_lower_bound,
4345 mj_scalar_t cut_upper_weight,
4346 mj_scalar_t cut_lower_weight,
4347 mj_scalar_t expected_weight,
4348 mj_scalar_t &new_cut_position,
4351 if(std::abs(cut_upper_bound - cut_lower_bound) < sEpsilon) {
4352 new_cut_position = cut_upper_bound;
4355 if(std::abs(cut_upper_weight - cut_lower_weight) < sEpsilon) {
4356 new_cut_position = cut_lower_bound;
4359 mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
4360 mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
4361 mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
4363 mj_scalar_t required_shift = (my_weight_diff / weight_range);
4364 int scale_constant = 20;
4365 int shiftint= int (required_shift * scale_constant);
4366 if(shiftint == 0) shiftint = 1;
4367 required_shift = mj_scalar_t (shiftint) / scale_constant;
4368 new_cut_position = coordinate_range * required_shift + cut_lower_bound;
4371 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4373 template<
class policy_t,
class scalar_t>
4383 int mj_value_count) :
4388 KOKKOS_INLINE_FUNCTION
4393 KOKKOS_INLINE_FUNCTION
4396 dst.
ptr[n] += src.
ptr[n];
4401 dst.
ptr = value->ptr;
4410 template<
class policy_t,
class scalar_t,
class part_t,
class index_t,
4416 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4428 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4429 Kokkos::View<int *, device_t> local_point_counts;
4430 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4433 part_t mj_concurrent_current_part,
4434 part_t mj_weight_array_size,
4435 Kokkos::View<index_t*, device_t> & mj_permutations,
4436 Kokkos::View<scalar_t *, device_t> & mj_coordinates,
4437 Kokkos::View<part_t*, device_t> & mj_parts,
4438 Kokkos::View<index_t *, device_t> & mj_part_xadj,
4439 Kokkos::View<index_t *, device_t> & mj_track_on_cuts
4440 #
if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4441 ,Kokkos::View<int *, device_t> & mj_local_point_counts
4444 concurrent_current_part(mj_concurrent_current_part),
4450 track_on_cuts(mj_track_on_cuts)
4451 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4452 ,local_point_counts(mj_local_point_counts)
4458 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4461 int result =
sizeof(array_t) * (
value_count) * team_size;
4465 int remainder = result % 8;
4466 if(remainder != 0) {
4467 result += 8 - remainder;
4472 KOKKOS_INLINE_FUNCTION
4473 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4478 index_t all_begin = (concurrent_current_part == 0) ? 0 :
4480 index_t all_end =
part_xadj(concurrent_current_part);
4482 index_t num_working_points = all_end - all_begin;
4483 int num_teams = teamMember.league_size();
4485 index_t stride = num_working_points / num_teams;
4486 if((num_working_points % num_teams) > 0) {
4490 index_t begin = all_begin + stride * teamMember.league_rank();
4491 index_t end = begin + stride;
4496 int track_on_cuts_insert_index = track_on_cuts.size() - 1;
4499 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4500 size_t sh_mem_size =
sizeof(array_t) * (
value_count);
4502 size_t sh_mem_size =
4503 sizeof(array_t) * (
value_count) * teamMember.team_size();
4506 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
4509 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4511 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
4516 teamMember.team_barrier();
4518 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, begin, end),
4520 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4528 Kokkos::parallel_reduce(
4529 Kokkos::TeamThreadRange(teamMember, begin, end),
4530 #
if (__cplusplus > 201703L)
4535 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4538 part_t place =
parts(coordinate_index);
4539 part_t part = place / 2;
4540 if(place % 2 == 0) {
4541 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4542 Kokkos::atomic_add(&shared_ptr[part], 1);
4544 threadSum.
ptr[part] += 1;
4547 parts(coordinate_index) = part;
4552 index_t set_index = Kokkos::atomic_fetch_add(
4553 &track_on_cuts(track_on_cuts_insert_index), 1);
4554 track_on_cuts(set_index) = ii;
4556 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4558 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4560 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4562 teamMember.team_barrier();
4565 #if (__cplusplus > 201703L)
4566 Kokkos::single(Kokkos::PerTeam(teamMember), [=,
this] () {
4568 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
4571 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4572 Kokkos::atomic_add(&local_point_counts(n), shared_ptr[n]);
4573 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4574 teamSum[n] += array.
ptr[n];
4575 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4579 teamMember.team_barrier();
4582 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4584 KOKKOS_INLINE_FUNCTION
4614 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4615 typename mj_part_t,
typename mj_node_t>
4616 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4617 mj_create_new_partitions(
4618 mj_part_t num_parts,
4619 mj_part_t current_concurrent_work_part,
4620 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
4621 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
4622 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
4623 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj)
4626 auto local_thread_part_weight_work = this->thread_part_weight_work;
4627 auto local_point_counts = this->thread_point_counts;
4628 auto local_distribute_points_on_cut_lines =
4629 this->distribute_points_on_cut_lines;
4630 auto local_thread_cut_line_weight_to_put_left =
4631 this->thread_cut_line_weight_to_put_left;
4632 auto local_sEpsilon = this->
sEpsilon;
4633 auto local_coordinate_permutations = this->coordinate_permutations;
4634 auto local_mj_weights = this->mj_weights;
4635 auto local_assigned_part_ids = this->assigned_part_ids;
4636 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
4638 mj_part_t num_cuts = num_parts - 1;
4640 Kokkos::parallel_for(
4641 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4642 KOKKOS_LAMBDA(
int dummy) {
4644 if(local_distribute_points_on_cut_lines) {
4645 for(
int i = 0; i <
num_cuts; ++i) {
4646 mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
4647 if(left_weight > local_sEpsilon) {
4649 mj_scalar_t thread_ii_weight_on_cut =
4650 local_thread_part_weight_work(i * 2 + 1) -
4651 local_thread_part_weight_work(i * 2);
4653 if(thread_ii_weight_on_cut < left_weight) {
4655 local_thread_cut_line_weight_to_put_left(i) =
4656 thread_ii_weight_on_cut;
4660 local_thread_cut_line_weight_to_put_left(i) = left_weight;
4662 left_weight -= thread_ii_weight_on_cut;
4665 local_thread_cut_line_weight_to_put_left(i) = 0;
4671 for(mj_part_t i = num_cuts - 1; i > 0 ; --i) {
4672 if(std::abs(current_concurrent_cut_coordinate(i) -
4673 current_concurrent_cut_coordinate(i -1)) < local_sEpsilon) {
4674 local_thread_cut_line_weight_to_put_left(i) -=
4675 local_thread_cut_line_weight_to_put_left(i - 1);
4677 local_thread_cut_line_weight_to_put_left(i) =
4678 static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
4679 least_signifiance) * significance_mul) /
4680 static_cast<mj_scalar_t
>(significance_mul);
4684 for(mj_part_t i = 0; i < num_parts; ++i) {
4685 local_point_counts(i) = 0;
4689 mj_lno_t coordinate_begin_index =
4690 current_concurrent_work_part == 0 ? 0 :
4691 host_part_xadj(current_concurrent_work_part - 1);
4692 mj_lno_t coordinate_end_index =
4693 host_part_xadj(current_concurrent_work_part);
4695 mj_lno_t total_on_cut;
4696 Kokkos::parallel_reduce(
"Get total_on_cut",
4697 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (
4698 coordinate_begin_index, coordinate_end_index),
4699 KOKKOS_LAMBDA(
int ii, mj_lno_t & val) {
4700 mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4701 mj_part_t coordinate_assigned_place =
4702 local_assigned_part_ids(coordinate_index);
4703 if(coordinate_assigned_place % 2 == 1) {
4708 Kokkos::View<mj_lno_t *, device_t> track_on_cuts;
4709 if(total_on_cut > 0) {
4710 track_on_cuts = Kokkos::View<mj_lno_t *, device_t>(
4721 typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;
4724 int use_num_teams = mj_num_teams ? mj_num_teams : 60;
4726 auto policy_ReduceFunctor = policy_t(use_num_teams, Kokkos::AUTO);
4727 typedef int array_t;
4731 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4732 Kokkos::View<array_t*, Kokkos::HostSpace> reduce_array(
"reduce_array", num_parts);
4735 ReduceArrayFunctor<policy_t, mj_scalar_t, mj_part_t, mj_lno_t,
4736 typename mj_node_t::device_type, array_t>teamFunctor(
4737 current_concurrent_work_part,
4739 coordinate_permutations,
4740 mj_current_dim_coords,
4744 #
if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4749 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4750 Kokkos::parallel_for(policy_ReduceFunctor, teamFunctor);
4752 Kokkos::parallel_reduce(policy_ReduceFunctor, teamFunctor, reduce_array);
4756 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4757 for(mj_part_t part = 0; part < num_parts; ++part) {
4758 local_point_counts(part) = reduce_array[part];
4764 if(track_on_cuts.size() > 0) {
4765 auto track_on_cuts_sort = Kokkos::subview(track_on_cuts,
4766 std::pair<mj_lno_t, mj_lno_t>(0, track_on_cuts.size() - 1));
4767 Kokkos::sort(track_on_cuts_sort);
4771 Kokkos::parallel_for(
4772 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4773 KOKKOS_LAMBDA (
int dummy) {
4775 for(
int j = 0; j < total_on_cut; ++j) {
4776 int ii = track_on_cuts(j);
4777 mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4778 mj_scalar_t coordinate_weight = uniform_weights0 ? 1 :
4779 local_mj_weights(coordinate_index,0);
4780 mj_part_t coordinate_assigned_place =
4781 local_assigned_part_ids(coordinate_index);
4782 mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
4784 if(local_distribute_points_on_cut_lines &&
4785 local_thread_cut_line_weight_to_put_left(
4786 coordinate_assigned_part) > local_sEpsilon) {
4790 local_thread_cut_line_weight_to_put_left(
4791 coordinate_assigned_part) -= coordinate_weight;
4796 if(local_thread_cut_line_weight_to_put_left(
4797 coordinate_assigned_part) < 0 && coordinate_assigned_part <
4799 std::abs(current_concurrent_cut_coordinate(
4800 coordinate_assigned_part+1) -
4801 current_concurrent_cut_coordinate(
4802 coordinate_assigned_part)) < local_sEpsilon)
4804 local_thread_cut_line_weight_to_put_left(
4805 coordinate_assigned_part + 1) +=
4806 local_thread_cut_line_weight_to_put_left(
4807 coordinate_assigned_part);
4809 ++local_point_counts(coordinate_assigned_part);
4810 local_assigned_part_ids(coordinate_index) =
4811 coordinate_assigned_part;
4816 ++coordinate_assigned_part;
4819 while(local_distribute_points_on_cut_lines &&
4820 coordinate_assigned_part < num_cuts)
4823 if(std::abs(current_concurrent_cut_coordinate(
4824 coordinate_assigned_part) -
4825 current_concurrent_cut_coordinate(
4826 coordinate_assigned_part - 1)) < local_sEpsilon)
4829 if(local_thread_cut_line_weight_to_put_left(
4830 coordinate_assigned_part) > local_sEpsilon &&
4831 local_thread_cut_line_weight_to_put_left(
4832 coordinate_assigned_part) >=
4833 std::abs(local_thread_cut_line_weight_to_put_left(
4834 coordinate_assigned_part) - coordinate_weight))
4836 local_thread_cut_line_weight_to_put_left(
4837 coordinate_assigned_part) -= coordinate_weight;
4841 if(local_thread_cut_line_weight_to_put_left(
4842 coordinate_assigned_part) < 0 &&
4843 coordinate_assigned_part < num_cuts - 1 &&
4844 std::abs(current_concurrent_cut_coordinate(
4845 coordinate_assigned_part+1) -
4846 current_concurrent_cut_coordinate(
4847 coordinate_assigned_part)) < local_sEpsilon)
4849 local_thread_cut_line_weight_to_put_left(
4850 coordinate_assigned_part + 1) +=
4851 local_thread_cut_line_weight_to_put_left(
4852 coordinate_assigned_part);
4860 ++coordinate_assigned_part;
4862 local_point_counts(coordinate_assigned_part) += 1;
4863 local_assigned_part_ids(coordinate_index) = coordinate_assigned_part;
4867 for(
int j = 0; j < num_parts; ++j) {
4868 out_part_xadj(j) = local_point_counts(j);
4869 local_point_counts(j) = 0;
4872 out_part_xadj(j) += out_part_xadj(j - 1);
4873 local_point_counts(j) += out_part_xadj(j - 1);
4881 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4886 Kokkos::parallel_for(
4887 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
4888 coordinate_begin_index, coordinate_end_index),
4889 KOKKOS_LAMBDA (mj_lno_t ii) {
4890 mj_lno_t i = local_coordinate_permutations(ii);
4891 mj_part_t p = local_assigned_part_ids(i);
4892 mj_lno_t idx = Kokkos::atomic_fetch_add(&local_point_counts(p), 1);
4893 local_new_coordinate_permutations(coordinate_begin_index + idx) = i;
4898 #ifdef KOKKOS_ENABLE_OPENMP
4900 const int num_threads = 1;
4902 const int num_threads = 1;
4905 const int num_teams = 1;
4908 Kokkos::View<mj_lno_t*, device_t>
4909 point_counter(
"insert indices", num_teams * num_threads * num_parts);
4913 Kokkos::TeamPolicy<typename mj_node_t::execution_space>
4914 block_policy(num_teams, num_threads);
4915 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>
::
4917 mj_lno_t range = coordinate_end_index - coordinate_begin_index;
4918 mj_lno_t block_size = range / num_teams + 1;
4919 Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(member_type team_member) {
4920 int team = team_member.league_rank();
4921 int team_offset = team * num_threads * num_parts;
4922 mj_lno_t begin = coordinate_begin_index + team * block_size;
4923 mj_lno_t end = begin + block_size;
4924 if(end > coordinate_end_index) {
4925 end = coordinate_end_index;
4928 Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
4930 int thread = team_member.team_rank();
4931 mj_lno_t i = local_coordinate_permutations(ii);
4932 mj_part_t p = local_assigned_part_ids(i);
4933 int index = team_offset + thread * num_parts + p;
4934 ++point_counter(index);
4942 Kokkos::parallel_for(
4943 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4944 KOKKOS_LAMBDA (
int dummy) {
4945 int num_sets = point_counter.size() / num_parts;
4946 for(
int set = num_sets - 1; set >= 1; set -=1) {
4947 int base = set * num_parts;
4948 for(
int part = 0; part < num_parts; ++part) {
4949 point_counter(base + part) = point_counter(base + part - num_parts);
4953 for(
int part = 0; part < num_parts; ++part) {
4954 point_counter(part) = 0;
4957 for(
int set = 1; set < num_sets; ++set) {
4958 int base = set * num_parts;
4959 for(
int part = 0; part < num_parts; ++part) {
4960 point_counter(base + part) += point_counter(base + part - num_parts);
4966 Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(member_type team_member) {
4967 int team = team_member.league_rank();
4968 int team_offset = team * num_threads * num_parts;
4969 mj_lno_t begin = coordinate_begin_index + team * block_size;
4970 mj_lno_t end = begin + block_size;
4971 if(end > coordinate_end_index) {
4972 end = coordinate_end_index;
4974 Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
4976 int thread = team_member.team_rank();
4977 mj_lno_t i = local_coordinate_permutations(ii);
4978 mj_part_t p = local_assigned_part_ids(i);
4979 int index = team_offset + thread * num_parts + p;
4980 int set_counter = (point_counter(index)++) + local_point_counts(p);
4981 local_new_coordinate_permutations(coordinate_begin_index + set_counter) = i;
5030 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5031 typename mj_part_t,
typename mj_node_t>
5032 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
5033 mj_node_t>::mj_get_new_cut_coordinates(
5034 mj_part_t current_concurrent_num_parts,
5036 const mj_part_t &num_cuts,
5037 const double &used_imbalance_tolerance,
5038 Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
5039 Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
5040 Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
5041 Kokkos::View<bool *, device_t> & current_cut_line_determined,
5042 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
5043 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
5044 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
5045 Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
5046 Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
5047 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
5048 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
5049 Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
5050 Kokkos::View<mj_scalar_t *, device_t> &
5051 current_part_cut_line_weight_to_put_left,
5052 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count)
5054 Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);
5056 auto local_device_incomplete_cut_count = device_incomplete_cut_count;
5058 auto local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
5059 auto local_global_rectilinear_cut_weight = global_rectilinear_cut_weight;
5060 auto local_process_rectilinear_cut_weight = process_rectilinear_cut_weight;
5061 auto local_global_min_max_coord_total_weight =
5062 global_min_max_coord_total_weight;
5064 const auto _sEpsilon = this->
sEpsilon;
5072 Kokkos::TeamPolicy<typename mj_node_t::execution_space>
5073 policy_one_team(1, Kokkos::AUTO());
5074 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>
::
5076 Kokkos::parallel_for(policy_one_team, KOKKOS_LAMBDA(member_type team_member) {
5078 mj_scalar_t min_coordinate =
5079 local_global_min_max_coord_total_weight(kk);
5080 mj_scalar_t max_coordinate =
5081 local_global_min_max_coord_total_weight(
5082 kk + current_concurrent_num_parts);
5083 mj_scalar_t global_total_weight =
5084 local_global_min_max_coord_total_weight(
5085 kk + current_concurrent_num_parts * 2);
5087 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, num_cuts),
5092 current_global_left_closest_points(i) > local_sEpsilon) {
5093 current_global_left_closest_points(i) =
5094 current_cut_coordinates(i);
5096 if(current_global_right_closest_points(i) -
5097 max_coordinate > local_sEpsilon) {
5098 current_global_right_closest_points(i) =
5099 current_cut_coordinates(i);
5102 team_member.team_barrier();
5104 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, num_cuts),
5106 using algMJ_t = AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
5109 mj_scalar_t seen_weight_in_part = 0;
5111 mj_scalar_t expected_weight_in_part = 0;
5113 double imbalance_on_left = 0, imbalance_on_right = 0;
5114 if(local_distribute_points_on_cut_lines) {
5116 local_global_rectilinear_cut_weight(i) = 0;
5117 local_process_rectilinear_cut_weight(i) = 0;
5119 bool bContinue =
false;
5122 if(current_cut_line_determined(i)) {
5123 new_current_cut_coordinates(i) =
5124 current_cut_coordinates(i);
5129 seen_weight_in_part = current_global_part_weights(i * 2);
5132 expected_weight_in_part = current_part_target_weights(i);
5135 imbalance_on_left = algMJ_t::calculate_imbalance(seen_weight_in_part,
5136 expected_weight_in_part);
5139 imbalance_on_right = algMJ_t::calculate_imbalance(global_total_weight -
5140 seen_weight_in_part, global_total_weight - expected_weight_in_part);
5141 bool is_left_imbalance_valid = std::abs(imbalance_on_left) -
5142 used_imbalance_tolerance < local_sEpsilon ;
5143 bool is_right_imbalance_valid = std::abs(imbalance_on_right) -
5144 used_imbalance_tolerance < local_sEpsilon;
5146 if(is_left_imbalance_valid && is_right_imbalance_valid) {
5147 current_cut_line_determined(i) =
true;
5148 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5149 new_current_cut_coordinates(i) = current_cut_coordinates(i);
5151 else if(imbalance_on_left < 0) {
5153 if(local_distribute_points_on_cut_lines) {
5158 if(current_global_part_weights(i * 2 + 1) ==
5159 expected_weight_in_part) {
5161 current_cut_line_determined(i) =
true;
5162 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5165 new_current_cut_coordinates(i) =
5166 current_cut_coordinates(i);
5168 current_part_cut_line_weight_to_put_left(i) =
5169 current_local_part_weights(i * 2 + 1) -
5170 current_local_part_weights(i * 2);
5173 else if(current_global_part_weights(i * 2 + 1) >
5174 expected_weight_in_part) {
5177 current_cut_line_determined(i) =
true;
5178 Kokkos::atomic_add(&view_rectilinear_cut_count(0), 1);
5182 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5183 new_current_cut_coordinates(i) =
5184 current_cut_coordinates(i);
5185 local_process_rectilinear_cut_weight[i] =
5186 current_local_part_weights(i * 2 + 1) -
5187 current_local_part_weights(i * 2);
5196 current_cut_lower_bounds(i) =
5197 current_global_right_closest_points(i);
5200 current_cut_lower_bound_weights(i) = seen_weight_in_part;
5205 for(mj_part_t ii = i + 1; ii <
num_cuts ; ++ii) {
5206 mj_scalar_t p_weight = current_global_part_weights(ii * 2);
5207 mj_scalar_t line_weight =
5208 current_global_part_weights(ii * 2 + 1);
5209 if(p_weight >= expected_weight_in_part) {
5214 if(p_weight == expected_weight_in_part) {
5215 current_cut_upper_bounds(i) =
5216 current_cut_coordinates(ii);
5217 current_cut_upper_weights(i) = p_weight;
5218 current_cut_lower_bounds(i) =
5219 current_cut_coordinates(ii);
5220 current_cut_lower_bound_weights(i) = p_weight;
5221 }
else if(p_weight < current_cut_upper_weights(i)) {
5224 current_cut_upper_bounds(i) =
5225 current_global_left_closest_points(ii);
5226 current_cut_upper_weights(i) = p_weight;
5232 if(line_weight >= expected_weight_in_part) {
5236 current_cut_upper_bounds(i) =
5237 current_cut_coordinates(ii);
5238 current_cut_upper_weights(i) = line_weight;
5239 current_cut_lower_bounds(i) =
5240 current_cut_coordinates(ii);
5241 current_cut_lower_bound_weights(i) = p_weight;
5246 if(p_weight <= expected_weight_in_part && p_weight >=
5247 current_cut_lower_bound_weights(i)) {
5248 current_cut_lower_bounds(i) =
5249 current_global_right_closest_points(ii);
5250 current_cut_lower_bound_weights(i) = p_weight;
5254 mj_scalar_t new_cut_position = 0;
5255 algMJ_t::mj_calculate_new_cut_position(
5256 current_cut_upper_bounds(i),
5257 current_cut_lower_bounds(i),
5258 current_cut_upper_weights(i),
5259 current_cut_lower_bound_weights(i),
5260 expected_weight_in_part, new_cut_position,
5265 if(std::abs(current_cut_coordinates(i) -
5266 new_cut_position) < local_sEpsilon) {
5267 current_cut_line_determined(i) =
true;
5268 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5271 new_current_cut_coordinates(i) =
5272 current_cut_coordinates(i);
5274 new_current_cut_coordinates(i) = new_cut_position;
5280 current_cut_upper_bounds(i) =
5281 current_global_left_closest_points(i);
5282 current_cut_upper_weights(i) =
5283 seen_weight_in_part;
5286 for(
int ii = i - 1; ii >= 0; --ii) {
5287 mj_scalar_t p_weight =
5288 current_global_part_weights(ii * 2);
5289 mj_scalar_t line_weight =
5290 current_global_part_weights(ii * 2 + 1);
5291 if(p_weight <= expected_weight_in_part) {
5292 if(p_weight == expected_weight_in_part) {
5295 current_cut_upper_bounds(i) =
5296 current_cut_coordinates(ii);
5297 current_cut_upper_weights(i) = p_weight;
5298 current_cut_lower_bounds(i) =
5299 current_cut_coordinates(ii);
5300 current_cut_lower_bound_weights(i) = p_weight;
5302 else if(p_weight > current_cut_lower_bound_weights(i)) {
5305 current_cut_lower_bounds(i) =
5306 current_global_right_closest_points(ii);
5307 current_cut_lower_bound_weights(i) = p_weight;
5313 if(line_weight > expected_weight_in_part) {
5314 current_cut_upper_bounds(i) =
5315 current_global_right_closest_points(ii);
5316 current_cut_upper_weights(i) = line_weight;
5326 if(p_weight >= expected_weight_in_part &&
5327 (p_weight < current_cut_upper_weights(i) ||
5328 (p_weight == current_cut_upper_weights(i) &&
5329 current_cut_upper_bounds(i) >
5330 current_global_left_closest_points(ii)))) {
5331 current_cut_upper_bounds(i) =
5332 current_global_left_closest_points(ii);
5333 current_cut_upper_weights(i) = p_weight;
5336 mj_scalar_t new_cut_position = 0;
5337 algMJ_t::mj_calculate_new_cut_position(
5338 current_cut_upper_bounds(i),
5339 current_cut_lower_bounds(i),
5340 current_cut_upper_weights(i),
5341 current_cut_lower_bound_weights(i),
5342 expected_weight_in_part,
5347 if(std::abs(current_cut_coordinates(i) -
5348 new_cut_position) < local_sEpsilon) {
5349 current_cut_line_determined(i) =
true;
5350 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5352 new_current_cut_coordinates(i) =
5353 current_cut_coordinates(i);
5355 new_current_cut_coordinates(i) =
5362 team_member.team_barrier();
5366 mj_part_t rectilinear_cut_count;
5367 Kokkos::parallel_reduce(
"Read bDoingWork",
5368 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
5369 KOKKOS_LAMBDA(
int dummy,
int & set_single) {
5370 set_single = view_rectilinear_cut_count(0);
5371 }, rectilinear_cut_count);
5373 if(rectilinear_cut_count > 0) {
5374 auto host_local_process_rectilinear_cut_weight =
5375 Kokkos::create_mirror_view(Kokkos::HostSpace(),
5376 local_process_rectilinear_cut_weight);
5377 auto host_local_global_rectilinear_cut_weight =
5378 Kokkos::create_mirror_view(Kokkos::HostSpace(),
5379 local_global_rectilinear_cut_weight);
5380 Kokkos::deep_copy(host_local_process_rectilinear_cut_weight,
5381 local_process_rectilinear_cut_weight);
5382 Kokkos::deep_copy(host_local_global_rectilinear_cut_weight,
5383 local_global_rectilinear_cut_weight);
5384 Teuchos::scan<int,mj_scalar_t>(
5385 *comm, Teuchos::REDUCE_SUM,
5387 host_local_process_rectilinear_cut_weight.data(),
5388 host_local_global_rectilinear_cut_weight.data());
5389 Kokkos::deep_copy(local_process_rectilinear_cut_weight,
5390 host_local_process_rectilinear_cut_weight);
5391 Kokkos::deep_copy(local_global_rectilinear_cut_weight,
5392 host_local_global_rectilinear_cut_weight);
5394 Kokkos::parallel_for(
"finish up mj_get_new_cut_coordinates",
5395 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
5396 KOKKOS_LAMBDA(
int dummy) {
5397 for(mj_part_t i = 0; i <
num_cuts; ++i) {
5399 if(local_global_rectilinear_cut_weight(i) > 0) {
5401 mj_scalar_t expected_part_weight = current_part_target_weights(i);
5403 mj_scalar_t necessary_weight_on_line_for_left =
5404 expected_part_weight - current_global_part_weights(i * 2);
5407 mj_scalar_t my_weight_on_line =
5408 local_process_rectilinear_cut_weight(i);
5412 mj_scalar_t weight_on_line_upto_process_inclusive =
5413 local_global_rectilinear_cut_weight(i);
5417 mj_scalar_t space_to_put_left =
5418 necessary_weight_on_line_for_left -
5419 weight_on_line_upto_process_inclusive;
5422 mj_scalar_t space_left_to_me =
5423 space_to_put_left + my_weight_on_line;
5436 if(space_left_to_me < 0) {
5439 current_part_cut_line_weight_to_put_left(i) = 0;
5441 else if(space_left_to_me >= my_weight_on_line) {
5445 current_part_cut_line_weight_to_put_left(i) =
5452 current_part_cut_line_weight_to_put_left(i) =
5459 view_rectilinear_cut_count(0) = 0;
5463 Kokkos::deep_copy(this->incomplete_cut_count, device_incomplete_cut_count);
5475 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5476 typename mj_part_t,
typename mj_node_t>
5477 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5478 get_processor_num_points_in_parts(
5479 mj_part_t num_procs,
5480 mj_part_t num_parts,
5481 mj_gno_t *&num_points_in_all_processor_parts)
5484 size_t allocation_size = num_parts * (num_procs + 1);
5491 mj_gno_t *num_local_points_in_each_part_to_reduce_sum =
5492 new mj_gno_t[allocation_size];
5496 mj_gno_t *my_local_points_to_reduce_sum =
5497 num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
5501 mj_gno_t *my_local_point_counts_in_each_part =
5502 num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
5505 memset(num_local_points_in_each_part_to_reduce_sum, 0,
5506 sizeof(mj_gno_t)*allocation_size);
5508 auto local_new_part_xadj = this->new_part_xadj;
5509 Kokkos::View<mj_gno_t *, typename mj_node_t::device_type> points_per_part(
5510 Kokkos::ViewAllocateWithoutInitializing(
"points per part"), num_parts);
5511 Kokkos::parallel_for(
"get vals on device",
5512 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_gno_t>
5513 (0, num_parts), KOKKOS_LAMBDA(mj_gno_t i) {
5514 points_per_part(i) =
5515 local_new_part_xadj(i) - ((i == 0) ? 0 : local_new_part_xadj(i-1));
5517 auto host_points_per_part = Kokkos::create_mirror_view(points_per_part);
5518 Kokkos::deep_copy(host_points_per_part, points_per_part);
5519 for(
int i = 0; i < num_parts; ++i) {
5520 my_local_points_to_reduce_sum[i] = host_points_per_part(i);
5525 memcpy (my_local_point_counts_in_each_part, my_local_points_to_reduce_sum,
5526 sizeof(mj_gno_t) * (num_parts) );
5533 reduceAll<int, mj_gno_t>(
5535 Teuchos::REDUCE_SUM,
5537 num_local_points_in_each_part_to_reduce_sum,
5538 num_points_in_all_processor_parts);
5542 delete [] num_local_points_in_each_part_to_reduce_sum;
5560 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5561 typename mj_part_t, typename mj_node_t>
5562 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5563 mj_check_to_migrate(
5564 size_t migration_reduce_all_population,
5565 mj_lno_t num_coords_for_last_dim_part,
5566 mj_part_t num_procs,
5567 mj_part_t num_parts,
5568 mj_gno_t *num_points_in_all_processor_parts)
5571 if(migration_reduce_all_population > future_reduceall_cutoff) {
5576 if(num_coords_for_last_dim_part < min_work_last_dim) {
5581 if(this->check_migrate_avoid_migration_option == 0) {
5582 double global_imbalance = 0;
5584 size_t global_shift = num_procs * num_parts;
5586 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5587 for(mj_part_t i = 0; i < num_parts; ++i) {
5588 double ideal_num = num_points_in_all_processor_parts[global_shift + i]
5589 / double(num_procs);
5591 global_imbalance += std::abs(ideal_num -
5592 num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num);
5595 global_imbalance /= num_parts;
5596 global_imbalance /= num_procs;
5598 if(global_imbalance <= this->minimum_migration_imbalance) {
5624 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5625 typename mj_part_t,
typename mj_node_t>
5626 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5627 assign_send_destinations(
5628 mj_part_t num_parts,
5629 mj_part_t *part_assignment_proc_begin_indices,
5630 mj_part_t *processor_chains_in_parts,
5631 mj_lno_t *send_count_to_each_proc,
5632 int *coordinate_destinations) {
5634 auto host_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
5635 deep_copy(host_new_part_xadj, this->new_part_xadj);
5637 auto host_new_coordinate_permutations =
5638 Kokkos::create_mirror_view(this->new_coordinate_permutations);
5639 deep_copy(host_new_coordinate_permutations, this->new_coordinate_permutations);
5641 for(mj_part_t p = 0; p < num_parts; ++p) {
5642 mj_lno_t part_begin = 0;
5643 if(p > 0) part_begin = host_new_part_xadj(p - 1);
5644 mj_lno_t part_end = host_new_part_xadj(p);
5646 mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
5648 mj_lno_t num_total_send = 0;
5649 for(mj_lno_t j=part_begin; j < part_end; j++) {
5650 mj_lno_t local_ind = host_new_coordinate_permutations(j);
5651 while (num_total_send >= send_count_to_each_proc[proc_to_sent]) {
5655 part_assignment_proc_begin_indices[p] =
5656 processor_chains_in_parts[proc_to_sent];
5658 processor_chains_in_parts[proc_to_sent] = -1;
5660 proc_to_sent = part_assignment_proc_begin_indices[p];
5663 coordinate_destinations[local_ind] = proc_to_sent;
5689 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5690 typename mj_part_t,
typename mj_node_t>
5691 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5692 mj_assign_proc_to_parts(
5693 mj_gno_t * num_points_in_all_processor_parts,
5694 mj_part_t num_parts,
5695 mj_part_t num_procs,
5696 mj_lno_t *send_count_to_each_proc,
5697 std::vector<mj_part_t> &processor_ranks_for_subcomm,
5698 std::vector<mj_part_t> *next_future_num_parts_in_parts,
5699 mj_part_t &out_part_index,
5700 mj_part_t &output_part_numbering_begin_index,
5701 int * coordinate_destinations) {
5702 mj_gno_t *global_num_points_in_parts =
5703 num_points_in_all_processor_parts + num_procs * num_parts;
5704 mj_part_t *num_procs_assigned_to_each_part =
new mj_part_t[num_parts];
5707 bool did_i_find_my_group =
false;
5709 mj_part_t num_free_procs = num_procs;
5710 mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;
5712 double max_imbalance_difference = 0;
5713 mj_part_t max_differing_part = 0;
5716 for(mj_part_t i = 0; i < num_parts; i++) {
5719 double scalar_required_proc = num_procs *
5720 (double (global_num_points_in_parts[i]) /
5721 double (this->num_global_coords));
5724 mj_part_t required_proc =
5725 static_cast<mj_part_t
> (0.5 + scalar_required_proc);
5726 if(required_proc == 0) required_proc = 1;
5732 required_proc < minimum_num_procs_required_for_rest_of_parts) {
5733 required_proc = num_free_procs -
5734 (minimum_num_procs_required_for_rest_of_parts);
5738 num_free_procs -= required_proc;
5742 --minimum_num_procs_required_for_rest_of_parts;
5745 num_procs_assigned_to_each_part[i] = required_proc;
5750 double imbalance_wrt_ideal =
5751 (scalar_required_proc - required_proc) / required_proc;
5752 if(imbalance_wrt_ideal > max_imbalance_difference) {
5753 max_imbalance_difference = imbalance_wrt_ideal;
5754 max_differing_part = i;
5760 if(num_free_procs > 0) {
5761 num_procs_assigned_to_each_part[max_differing_part] += num_free_procs;
5768 mj_part_t *part_assignment_proc_begin_indices =
new mj_part_t[num_parts];
5772 mj_part_t *processor_chains_in_parts =
new mj_part_t [num_procs];
5773 mj_part_t *processor_part_assignments =
new mj_part_t[num_procs];
5782 for(
int i = 0; i < num_procs; ++i ) {
5783 processor_part_assignments[i] = -1;
5784 processor_chains_in_parts[i] = -1;
5786 for(
int i = 0; i < num_parts; ++i ) {
5787 part_assignment_proc_begin_indices[i] = -1;
5793 uSignedSortItem<mj_part_t, mj_gno_t, char> *
5794 sort_item_num_part_points_in_procs =
5795 new uSignedSortItem<mj_part_t, mj_gno_t, char>[num_procs];
5797 for(mj_part_t i = 0; i < num_parts; ++i) {
5802 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5803 sort_item_num_part_points_in_procs[ii].id = ii;
5806 if(processor_part_assignments[ii] == -1) {
5807 sort_item_num_part_points_in_procs[ii].val =
5808 num_points_in_all_processor_parts[ii * num_parts + i];
5810 sort_item_num_part_points_in_procs[ii].signbit = 1;
5823 sort_item_num_part_points_in_procs[ii].val =
5824 num_points_in_all_processor_parts[ii * num_parts + i];
5825 sort_item_num_part_points_in_procs[ii].signbit = 0;
5830 uqSignsort<mj_part_t, mj_gno_t,char>
5831 (num_procs, sort_item_num_part_points_in_procs);
5843 mj_part_t required_proc_count = num_procs_assigned_to_each_part[i];
5844 mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
5845 mj_gno_t ideal_num_points_in_a_proc = Teuchos::as<mj_gno_t>(
5846 ceil(total_num_points_in_part /
double (required_proc_count)));
5849 mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
5850 mj_part_t next_proc_to_send_id =
5851 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
5852 mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc -
5853 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
5857 for(mj_part_t ii = num_procs - 1;
5858 ii >= num_procs - required_proc_count; --ii) {
5859 mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].id;
5861 processor_part_assignments[proc_id] = i;
5864 bool did_change_sign =
false;
5866 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5869 if(sort_item_num_part_points_in_procs[ii].signbit == 0) {
5870 did_change_sign =
true;
5871 sort_item_num_part_points_in_procs[ii].signbit = 1;
5878 if(did_change_sign) {
5881 uqSignsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count,
5882 sort_item_num_part_points_in_procs);
5897 if(!did_i_find_my_group) {
5898 for(mj_part_t ii = num_procs - 1; ii >=
5899 num_procs - required_proc_count; --ii) {
5901 mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].id;
5904 processor_ranks_for_subcomm.push_back(proc_id_to_assign);
5906 if(proc_id_to_assign == this->myRank) {
5908 did_i_find_my_group =
true;
5911 part_assignment_proc_begin_indices[i] = this->myRank;
5912 processor_chains_in_parts[this->myRank] = -1;
5916 send_count_to_each_proc[this->myRank] =
5917 sort_item_num_part_points_in_procs[ii].val;
5921 for(mj_part_t in = 0; in < i; ++in) {
5922 output_part_numbering_begin_index +=
5923 (*next_future_num_parts_in_parts)[in];
5931 if(!did_i_find_my_group) {
5932 processor_ranks_for_subcomm.clear();
5940 for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii) {
5941 mj_part_t nonassigned_proc_id =
5942 sort_item_num_part_points_in_procs[ii].id;
5943 mj_lno_t num_points_to_sent =
5944 sort_item_num_part_points_in_procs[ii].val;
5950 if(num_points_to_sent < 0) {
5951 cout <<
"Migration - processor assignments - for part:" << i
5952 <<
"from proc:" << nonassigned_proc_id <<
" num_points_to_sent:"
5953 << num_points_to_sent << std::endl;
5958 switch (migration_type) {
5962 while (num_points_to_sent > 0) {
5964 if(num_points_to_sent <= space_left_in_sent_proc) {
5966 space_left_in_sent_proc -= num_points_to_sent;
5968 if(this->myRank == nonassigned_proc_id) {
5970 send_count_to_each_proc[next_proc_to_send_id] =
5975 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
5976 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
5977 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
5979 num_points_to_sent = 0;
5983 if(space_left_in_sent_proc > 0) {
5984 num_points_to_sent -= space_left_in_sent_proc;
5987 if(this->myRank == nonassigned_proc_id) {
5989 send_count_to_each_proc[next_proc_to_send_id] =
5990 space_left_in_sent_proc;
5991 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
5992 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
5993 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
5997 ++next_proc_to_send_index;
6000 if(next_part_to_send_index < nprocs - required_proc_count ) {
6001 cout <<
"Migration - processor assignments - for part:"
6003 <<
" next_part_to_send :" << next_part_to_send_index
6004 <<
" nprocs:" << nprocs
6005 <<
" required_proc_count:" << required_proc_count
6006 <<
" Error: next_part_to_send_index <" <<
6007 <<
" nprocs - required_proc_count" << std::endl;
6012 next_proc_to_send_id =
6013 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
6015 space_left_in_sent_proc = ideal_num_points_in_a_proc -
6016 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
6027 if(this->myRank == nonassigned_proc_id) {
6029 send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
6033 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6034 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6035 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6037 num_points_to_sent = 0;
6038 ++next_proc_to_send_index;
6042 if(next_proc_to_send_index == num_procs) {
6043 next_proc_to_send_index = num_procs - required_proc_count;
6046 next_proc_to_send_id =
6047 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
6049 space_left_in_sent_proc = ideal_num_points_in_a_proc -
6050 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
6063 this->assign_send_destinations(
6065 part_assignment_proc_begin_indices,
6066 processor_chains_in_parts,
6067 send_count_to_each_proc,
6068 coordinate_destinations);
6069 delete [] part_assignment_proc_begin_indices;
6070 delete [] processor_chains_in_parts;
6071 delete [] processor_part_assignments;
6072 delete [] sort_item_num_part_points_in_procs;
6073 delete [] num_procs_assigned_to_each_part;
6091 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6092 typename mj_part_t,
typename mj_node_t>
6093 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6094 assign_send_destinations2(
6095 mj_part_t num_parts,
6096 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
6097 int *coordinate_destinations,
6098 mj_part_t &output_part_numbering_begin_index,
6099 std::vector<mj_part_t> *next_future_num_parts_in_parts)
6101 mj_part_t part_shift_amount = output_part_numbering_begin_index;
6102 mj_part_t previous_processor = -1;
6104 auto local_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
6105 Kokkos::deep_copy(local_new_part_xadj, this->new_part_xadj);
6107 auto local_new_coordinate_permutations =
6108 Kokkos::create_mirror_view(this->new_coordinate_permutations);
6109 Kokkos::deep_copy(local_new_coordinate_permutations,
6110 this->new_coordinate_permutations);
6112 for(mj_part_t i = 0; i < num_parts; ++i) {
6113 mj_part_t p = sort_item_part_to_proc_assignment[i].id;
6116 mj_lno_t part_begin_index = 0;
6119 part_begin_index = local_new_part_xadj(p - 1);
6122 mj_lno_t part_end_index = local_new_part_xadj(p);
6124 mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].val;
6125 if(this->myRank == assigned_proc && previous_processor != assigned_proc) {
6126 output_part_numbering_begin_index = part_shift_amount;
6128 previous_processor = assigned_proc;
6129 part_shift_amount += (*next_future_num_parts_in_parts)[p];
6131 for(mj_lno_t j= part_begin_index; j < part_end_index; j++) {
6132 mj_lno_t localInd = local_new_coordinate_permutations(j);
6133 coordinate_destinations[localInd] = assigned_proc;
6159 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6160 typename mj_part_t,
typename mj_node_t>
6161 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6162 mj_assign_parts_to_procs(
6163 mj_gno_t * num_points_in_all_processor_parts,
6164 mj_part_t num_parts,
6165 mj_part_t num_procs,
6166 mj_lno_t *send_count_to_each_proc,
6167 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6168 mj_part_t &out_num_part,
6169 std::vector<mj_part_t> &out_part_indices,
6170 mj_part_t &output_part_numbering_begin_index,
6171 int *coordinate_destinations) {
6174 mj_gno_t *global_num_points_in_parts =
6175 num_points_in_all_processor_parts + num_procs * num_parts;
6176 out_part_indices.clear();
6180 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment =
6181 new uSortItem<mj_part_t, mj_part_t>[num_parts];
6182 uSortItem<mj_part_t, mj_gno_t> * sort_item_num_points_of_proc_in_part_i =
6183 new uSortItem<mj_part_t, mj_gno_t>[num_procs];
6187 mj_lno_t work_each =
6188 mj_lno_t (this->num_global_coords / (
double (num_procs)) + 0.5f);
6192 mj_lno_t *space_in_each_processor =
new mj_lno_t[num_procs];
6195 for(mj_part_t i = 0; i < num_procs; ++i) {
6196 space_in_each_processor[i] = work_each;
6203 mj_part_t *num_parts_proc_assigned =
new mj_part_t[num_procs];
6204 memset(num_parts_proc_assigned, 0,
sizeof(mj_part_t) * num_procs);
6205 int empty_proc_count = num_procs;
6209 uSortItem<mj_part_t, mj_gno_t> * sort_item_point_counts_in_parts =
6210 new uSortItem<mj_part_t, mj_gno_t>[num_parts];
6215 for(mj_part_t i = 0; i < num_parts; ++i) {
6216 sort_item_point_counts_in_parts[i].id = i;
6217 sort_item_point_counts_in_parts[i].val = global_num_points_in_parts[i];
6221 uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);
6226 for(mj_part_t j = 0; j < num_parts; ++j) {
6228 mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].id;
6231 mj_gno_t load = global_num_points_in_parts[i];
6234 mj_part_t assigned_proc = -1;
6237 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
6238 sort_item_num_points_of_proc_in_part_i[ii].id = ii;
6243 if(empty_proc_count < num_parts - j ||
6244 num_parts_proc_assigned[ii] == 0) {
6246 sort_item_num_points_of_proc_in_part_i[ii].val =
6247 num_points_in_all_processor_parts[ii * num_parts + i];
6250 sort_item_num_points_of_proc_in_part_i[ii].val = -1;
6254 uqsort<mj_part_t, mj_gno_t>(num_procs,
6255 sort_item_num_points_of_proc_in_part_i);
6258 for(mj_part_t iii = num_procs - 1; iii >= 0; --iii) {
6259 mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
6260 if(assigned_proc == -1 ||
6261 (space_in_each_processor[ii] > space_in_each_processor[assigned_proc])) {
6264 else if(space_in_each_processor[ii] == space_in_each_processor[assigned_proc]) {
6265 if(ii < assigned_proc) {
6281 if(num_parts_proc_assigned[assigned_proc]++ == 0) {
6285 space_in_each_processor[assigned_proc] -= load;
6287 sort_item_part_to_proc_assignment[j].id = i;
6290 sort_item_part_to_proc_assignment[j].val = assigned_proc;
6293 if(assigned_proc == this->myRank) {
6295 out_part_indices.push_back(i);
6301 send_count_to_each_proc[assigned_proc] +=
6302 num_points_in_all_processor_parts[this->myRank * num_parts + i];
6305 delete [] num_parts_proc_assigned;
6306 delete [] sort_item_num_points_of_proc_in_part_i;
6307 delete [] sort_item_point_counts_in_parts;
6308 delete [] space_in_each_processor;
6311 uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);
6314 this->assign_send_destinations2(
6316 sort_item_part_to_proc_assignment,
6317 coordinate_destinations,
6318 output_part_numbering_begin_index,
6319 next_future_num_parts_in_parts);
6321 delete [] sort_item_part_to_proc_assignment;
6348 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6349 typename mj_part_t,
typename mj_node_t>
6350 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6351 mj_migration_part_proc_assignment(
6352 mj_gno_t * num_points_in_all_processor_parts,
6353 mj_part_t num_parts,
6354 mj_part_t num_procs,
6355 mj_lno_t *send_count_to_each_proc,
6356 std::vector<mj_part_t> &processor_ranks_for_subcomm,
6357 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6358 mj_part_t &out_num_part,
6359 std::vector<mj_part_t> &out_part_indices,
6360 mj_part_t &output_part_numbering_begin_index,
6361 int *coordinate_destinations)
6363 processor_ranks_for_subcomm.clear();
6365 if(num_procs > num_parts) {
6370 mj_part_t out_part_index = 0;
6372 this->mj_assign_proc_to_parts(
6373 num_points_in_all_processor_parts,
6376 send_count_to_each_proc,
6377 processor_ranks_for_subcomm,
6378 next_future_num_parts_in_parts,
6380 output_part_numbering_begin_index,
6381 coordinate_destinations
6385 out_part_indices.clear();
6386 out_part_indices.push_back(out_part_index);
6393 processor_ranks_for_subcomm.push_back(this->myRank);
6398 this->mj_assign_parts_to_procs(
6399 num_points_in_all_processor_parts,
6402 send_count_to_each_proc,
6403 next_future_num_parts_in_parts,
6406 output_part_numbering_begin_index,
6407 coordinate_destinations);
6424 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6425 typename mj_part_t,
typename mj_node_t>
6426 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6428 mj_part_t num_procs,
6429 mj_lno_t &num_new_local_points,
6430 std::string iteration,
6431 int *coordinate_destinations,
6432 mj_part_t num_parts)
6435 #ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
6436 if(
sizeof(mj_lno_t) <=
sizeof(
int)) {
6440 ZOLTAN_COMM_OBJ *plan = NULL;
6441 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->comm));
6442 int num_incoming_gnos = 0;
6443 int message_tag = 7859;
6446 mj_timer_base_string +
"Migration Z1PlanCreating-" + iteration);
6447 int ierr = Zoltan_Comm_Create(
6449 int(this->num_local_coords),
6450 coordinate_destinations,
6453 &num_incoming_gnos);
6457 mj_timer_base_string +
"Migration Z1PlanCreating-" + iteration);
6460 mj_timer_base_string +
"Migration Z1Migration-" + iteration);
6468 auto host_current_mj_gnos = Kokkos::create_mirror_view(
6469 Kokkos::HostSpace(), this->current_mj_gnos);
6470 Kokkos::deep_copy(host_current_mj_gnos, this->current_mj_gnos);
6471 Kokkos::View<mj_gno_t*, device_t> dst_gnos(
6472 Kokkos::ViewAllocateWithoutInitializing(
"dst_gnos"), num_incoming_gnos);
6473 auto host_dst_gnos = Kokkos::create_mirror_view(
6474 Kokkos::HostSpace(), dst_gnos);
6476 ierr = Zoltan_Comm_Do(
6479 (
char *) host_current_mj_gnos.data(),
6481 (
char *) host_dst_gnos.data());
6483 Kokkos::deep_copy(dst_gnos, host_dst_gnos);
6484 this->current_mj_gnos = dst_gnos;
6490 auto host_src_coordinates = Kokkos::create_mirror_view(
6491 Kokkos::HostSpace(), this->mj_coordinates);
6492 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6493 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6494 dst_coordinates(Kokkos::ViewAllocateWithoutInitializing(
"mj_coordinates"),
6495 num_incoming_gnos, this->coord_dim);
6496 auto host_dst_coordinates = Kokkos::create_mirror_view(
6497 Kokkos::HostSpace(), dst_coordinates);
6498 for(
int i = 0; i < this->coord_dim; ++i) {
6499 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sub_host_src_coordinates
6500 = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6501 Kokkos::View<mj_scalar_t *, Kokkos::HostSpace> sub_host_dst_coordinates
6502 = Kokkos::subview(host_dst_coordinates, Kokkos::ALL, i);
6505 ierr = Zoltan_Comm_Do(
6508 (
char *) sub_host_src_coordinates.data(),
6509 sizeof(mj_scalar_t),
6510 (
char *) sub_host_dst_coordinates.data());
6513 deep_copy(dst_coordinates, host_dst_coordinates);
6514 this->mj_coordinates = dst_coordinates;
6519 auto host_src_weights = Kokkos::create_mirror_view(
6520 Kokkos::HostSpace(), this->mj_weights);
6521 Kokkos::deep_copy(host_src_weights, this->mj_weights);
6522 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6523 Kokkos::ViewAllocateWithoutInitializing(
"mj_weights"),
6524 num_incoming_gnos, this->num_weights_per_coord);
6525 auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
6526 for(
int i = 0; i < this->num_weights_per_coord; ++i) {
6527 auto sub_host_src_weights
6528 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6529 auto sub_host_dst_weights
6530 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6531 ArrayRCP<mj_scalar_t> sent_weight(this->num_local_coords);
6533 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6534 sent_weight[n] = sub_host_src_weights(n);
6536 ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
6538 ierr = Zoltan_Comm_Do(
6541 (
char *) sent_weight.getRawPtr(),
6542 sizeof(mj_scalar_t),
6543 (
char *) received_weight.getRawPtr());
6546 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6547 sub_host_dst_weights(n) = received_weight[n];
6550 deep_copy(dst_weights, host_dst_weights);
6551 this->mj_weights = dst_weights;
6557 Kokkos::View<int *, Kokkos::HostSpace> dst_owners_of_coordinate(
6558 Kokkos::ViewAllocateWithoutInitializing(
"owner_of_coordinate"),
6561 ierr = Zoltan_Comm_Do(
6564 (
char *) owner_of_coordinate.data(),
6566 (
char *) dst_owners_of_coordinate.data());
6568 this->owner_of_coordinate = dst_owners_of_coordinate;
6575 auto host_src_assigned_part_ids = Kokkos::create_mirror_view(
6576 Kokkos::HostSpace(), this->assigned_part_ids);
6577 Kokkos::deep_copy(host_src_assigned_part_ids, this->assigned_part_ids);
6578 Kokkos::View<int *, device_t> dst_assigned_part_ids(
6579 Kokkos::ViewAllocateWithoutInitializing(
"assigned_part_ids"),
6581 auto host_dst_assigned_part_ids = Kokkos::create_mirror_view(
6582 Kokkos::HostSpace(), dst_assigned_part_ids);
6583 mj_part_t *new_parts =
new mj_part_t[num_incoming_gnos];
6584 if(num_procs < num_parts) {
6586 ierr = Zoltan_Comm_Do(
6589 (
char *) host_src_assigned_part_ids.data(),
6591 (
char *) host_dst_assigned_part_ids.data());
6593 Kokkos::deep_copy(dst_assigned_part_ids, host_dst_assigned_part_ids);
6597 this->assigned_part_ids = dst_assigned_part_ids;
6600 ierr = Zoltan_Comm_Destroy(&plan);
6602 num_new_local_points = num_incoming_gnos;
6604 mj_timer_base_string +
"Migration Z1Migration-" + iteration);
6607 #endif // ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
6609 this->mj_env->timerStart(
MACRO_TIMERS, mj_timer_base_string +
6610 "Migration DistributorPlanCreating-" + iteration);
6612 Tpetra::Distributor distributor(this->comm);
6613 ArrayView<const mj_part_t> destinations( coordinate_destinations,
6614 this->num_local_coords);
6615 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
6616 this->mj_env->timerStop(
MACRO_TIMERS, mj_timer_base_string +
6617 "Migration DistributorPlanCreating-" + iteration);
6618 this->mj_env->timerStart(
MACRO_TIMERS, mj_timer_base_string +
6619 "Migration DistributorMigration-" + iteration);
6627 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
6628 Kokkos::ViewAllocateWithoutInitializing(
"received_gnos"),
6631 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
6632 Kokkos::ViewAllocateWithoutInitializing(
"sent_gnos"),
6633 this->current_mj_gnos.extent(0));
6634 Kokkos::deep_copy(sent_gnos, this->current_mj_gnos);
6636 distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
6638 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
6639 Kokkos::ViewAllocateWithoutInitializing(
"gids"), num_incoming_gnos);
6641 Kokkos::deep_copy(this->current_mj_gnos, received_gnos);
6646 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6647 dst_coordinates(
"mj_coordinates", num_incoming_gnos, this->coord_dim);
6649 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>
6650 host_src_coordinates(
6651 Kokkos::ViewAllocateWithoutInitializing(
"host_coords"),
6652 this->mj_coordinates.extent(0), this->mj_coordinates.extent(1));
6653 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6655 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_coord(
6656 Kokkos::ViewAllocateWithoutInitializing(
"received_coord"),
6659 for(
int i = 0; i < this->coord_dim; ++i) {
6663 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_coord
6664 = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6666 distributor.doPostsAndWaits(sent_coord, 1, received_coord);
6668 Kokkos::deep_copy(Kokkos::subview(dst_coordinates, Kokkos::ALL, i),
6674 this->mj_coordinates = dst_coordinates;
6677 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6678 "mj_weights", num_incoming_gnos, this->num_weights_per_coord);
6679 auto host_dst_weights = Kokkos::create_mirror_view(Kokkos::HostSpace(),
6682 auto host_src_weights = Kokkos::create_mirror_view_and_copy(
6683 Kokkos::HostSpace(), this->mj_weights);
6686 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_weight(
6687 Kokkos::ViewAllocateWithoutInitializing(
"send_weight_buffer"),
6688 this->num_local_coords);
6690 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_weight(
6691 Kokkos::ViewAllocateWithoutInitializing(
"received_weight_buffer"),
6694 for(
int i = 0; i < this->num_weights_per_coord; ++i) {
6696 auto sub_host_src_weights
6697 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6699 auto sub_host_dst_weights
6700 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6707 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6708 sent_weight[n] = sub_host_src_weights(n);
6711 distributor.doPostsAndWaits(sent_weight, 1, received_weight);
6714 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6715 sub_host_dst_weights(n) = received_weight[n];
6718 Kokkos::deep_copy(dst_weights, host_dst_weights);
6719 this->mj_weights = dst_weights;
6724 Kokkos::View<int *, Kokkos::HostSpace> received_owners(
6725 Kokkos::ViewAllocateWithoutInitializing(
"owner_of_coordinate"),
6728 distributor.doPostsAndWaits(owner_of_coordinate, 1, received_owners);
6730 this->owner_of_coordinate = received_owners;
6736 if(num_procs < num_parts) {
6737 Kokkos::View<mj_part_t*, Kokkos::HostSpace> sent_partids(
6738 Kokkos::ViewAllocateWithoutInitializing(
"host_parts"),
6739 this->assigned_part_ids.extent(0));
6740 Kokkos::deep_copy(sent_partids, assigned_part_ids);
6742 Kokkos::View<mj_part_t*, Kokkos::HostSpace> received_partids(
6743 Kokkos::ViewAllocateWithoutInitializing(
"received_partids"),
6746 distributor.doPostsAndWaits(sent_partids, 1, received_partids);
6748 this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6749 (
"assigned_part_ids", num_incoming_gnos);
6750 Kokkos::deep_copy(this->assigned_part_ids, received_partids);
6753 this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6754 (
"assigned_part_ids", num_incoming_gnos);
6756 this->mj_env->timerStop(
MACRO_TIMERS,
"" + mj_timer_base_string +
6757 "Migration DistributorMigration-" + iteration);
6759 num_new_local_points = num_incoming_gnos;
6768 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6769 typename mj_part_t,
typename mj_node_t>
6770 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6771 create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm)
6773 mj_part_t group_size = processor_ranks_for_subcomm.size();
6774 mj_part_t *ids =
new mj_part_t[group_size];
6775 for(mj_part_t i = 0; i < group_size; ++i) {
6776 ids[i] = processor_ranks_for_subcomm[i];
6778 ArrayView<const mj_part_t> idView(ids, group_size);
6779 this->comm = this->comm->createSubcommunicator(idView);
6788 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6789 typename mj_part_t,
typename mj_node_t>
6790 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6791 fill_permutation_array(
6792 mj_part_t output_num_parts,
6793 mj_part_t num_parts)
6796 if(output_num_parts == 1) {
6797 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
6798 Kokkos::parallel_for(
6799 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
6800 (0, this->num_local_coords),
6801 KOKKOS_LAMBDA(mj_lno_t i) {
6802 local_new_coordinate_permutations(i) = i;
6804 auto local_new_part_xadj = this->new_part_xadj;
6805 auto local_num_local_coords = this->num_local_coords;
6806 Kokkos::parallel_for(
6807 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
6808 KOKKOS_LAMBDA(
int dummy) {
6809 local_new_part_xadj(0) = local_num_local_coords;
6813 auto local_num_local_coords = this->num_local_coords;
6814 auto local_assigned_part_ids = this->assigned_part_ids;
6815 auto local_new_part_xadj = this->new_part_xadj;
6816 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
6819 Kokkos::View<mj_part_t*, device_t> part_shifts(
"part_shifts", num_parts);
6824 Kokkos::View<mj_lno_t*, device_t> num_points_in_parts(
6825 "num_points_in_parts", num_parts);
6827 Kokkos::parallel_for(
6828 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
6829 KOKKOS_LAMBDA(
int dummy) {
6831 for(mj_lno_t i = 0; i < local_num_local_coords; ++i) {
6832 mj_part_t ii = local_assigned_part_ids(i);
6833 ++num_points_in_parts(ii);
6838 mj_lno_t prev_index = 0;
6839 for(mj_part_t i = 0; i < num_parts; ++i) {
6840 if(num_points_in_parts(i) > 0) {
6841 local_new_part_xadj(p) = prev_index + num_points_in_parts(i);
6842 prev_index += num_points_in_parts(i);
6843 part_shifts(i) = p++;
6848 mj_part_t assigned_num_parts = p - 1;
6849 for(;p < num_parts; ++p) {
6850 local_new_part_xadj(p) =
6851 local_new_part_xadj(assigned_num_parts);
6853 for(mj_part_t i = 0; i < output_num_parts; ++i) {
6854 num_points_in_parts(i) = local_new_part_xadj(i);
6860 for(mj_lno_t i = local_num_local_coords - 1; i >= 0; --i) {
6862 part_shifts[mj_part_t(local_assigned_part_ids(i))];
6863 local_new_coordinate_permutations(--num_points_in_parts[part]) = i;
6893 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6894 typename mj_part_t,
typename mj_node_t>
6895 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6896 mj_perform_migration(
6897 mj_part_t input_num_parts,
6898 mj_part_t &output_num_parts,
6899 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6900 mj_part_t &output_part_begin_index,
6901 size_t migration_reduce_all_population,
6902 mj_lno_t num_coords_for_last_dim_part,
6903 std::string iteration,
6904 RCP<mj_partBoxVector_t> &input_part_boxes,
6905 RCP<mj_partBoxVector_t> &output_part_boxes)
6907 mj_part_t num_procs = this->comm->getSize();
6908 this->myRank = this->comm->getRank();
6913 mj_gno_t *num_points_in_all_processor_parts =
6914 new mj_gno_t[input_num_parts * (num_procs + 1)];
6917 this->get_processor_num_points_in_parts(
6920 num_points_in_all_processor_parts);
6923 if(!this->mj_check_to_migrate(
6924 migration_reduce_all_population,
6925 num_coords_for_last_dim_part,
6928 num_points_in_all_processor_parts)) {
6929 delete [] num_points_in_all_processor_parts;
6933 mj_lno_t *send_count_to_each_proc = NULL;
6934 int *coordinate_destinations =
new int[this->num_local_coords];
6935 send_count_to_each_proc =
new mj_lno_t[num_procs];
6937 for(
int i = 0; i < num_procs; ++i) {
6938 send_count_to_each_proc[i] = 0;
6941 std::vector<mj_part_t> processor_ranks_for_subcomm;
6942 std::vector<mj_part_t> out_part_indices;
6945 this->mj_migration_part_proc_assignment(
6946 num_points_in_all_processor_parts,
6949 send_count_to_each_proc,
6950 processor_ranks_for_subcomm,
6951 next_future_num_parts_in_parts,
6954 output_part_begin_index,
6955 coordinate_destinations);
6957 delete [] send_count_to_each_proc;
6958 std::vector <mj_part_t> tmpv;
6960 std::sort (out_part_indices.begin(), out_part_indices.end());
6961 mj_part_t outP = out_part_indices.size();
6962 mj_gno_t new_global_num_points = 0;
6963 mj_gno_t *global_num_points_in_parts =
6964 num_points_in_all_processor_parts + num_procs * input_num_parts;
6966 if(this->mj_keep_part_boxes) {
6967 input_part_boxes->clear();
6972 for(mj_part_t i = 0; i < outP; ++i) {
6973 mj_part_t ind = out_part_indices[i];
6974 new_global_num_points += global_num_points_in_parts[ind];
6975 tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
6976 if(this->mj_keep_part_boxes) {
6977 input_part_boxes->push_back((*output_part_boxes)[ind]);
6982 if(this->mj_keep_part_boxes) {
6983 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
6984 input_part_boxes = output_part_boxes;
6985 output_part_boxes = tmpPartBoxes;
6987 next_future_num_parts_in_parts->clear();
6988 for(mj_part_t i = 0; i < outP; ++i) {
6989 mj_part_t p = tmpv[i];
6990 next_future_num_parts_in_parts->push_back(p);
6993 delete [] num_points_in_all_processor_parts;
6995 mj_lno_t num_new_local_points = 0;
6997 this->mj_migrate_coords(
6999 num_new_local_points,
7001 coordinate_destinations,
7004 delete [] coordinate_destinations;
7005 if(this->num_local_coords != num_new_local_points) {
7006 this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
7007 (Kokkos::ViewAllocateWithoutInitializing(
"new_coordinate_permutations"),
7008 num_new_local_points);
7009 this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
7010 (Kokkos::ViewAllocateWithoutInitializing(
"coordinate_permutations"),
7011 num_new_local_points);
7013 this->num_local_coords = num_new_local_points;
7014 this->num_global_coords = new_global_num_points;
7017 this->create_sub_communicator(processor_ranks_for_subcomm);
7019 processor_ranks_for_subcomm.clear();
7022 this->fill_permutation_array(output_num_parts, input_num_parts);
7045 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7046 typename mj_part_t,
typename mj_node_t>
7047 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7048 create_consistent_chunks(
7049 mj_part_t num_parts,
7050 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
7051 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
7052 mj_lno_t coordinate_begin,
7053 mj_lno_t coordinate_end,
7054 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
7055 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
7057 bool longest_dim_part,
7058 uSignedSortItem<int, mj_scalar_t, char> * p_coord_dimension_range_sorted)
7068 mj_part_t no_cuts = num_parts - 1;
7072 if(this->distribute_points_on_cut_lines) {
7073 auto local_thread_cut_line_weight_to_put_left =
7074 this->thread_cut_line_weight_to_put_left;
7075 auto local_thread_part_weight_work =
7076 this->thread_part_weight_work;
7077 auto local_sEpsilon = this->
sEpsilon;
7079 Kokkos::parallel_for(
7080 Kokkos::RangePolicy<
typename mj_node_t::execution_space,
7081 mj_part_t> (0, no_cuts), KOKKOS_LAMBDA (mj_part_t i) {
7083 mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
7084 if(left_weight > local_sEpsilon) {
7086 mj_scalar_t thread_ii_weight_on_cut =
7087 local_thread_part_weight_work(i * 2 + 1) -
7088 local_thread_part_weight_work(i * 2);
7089 if(thread_ii_weight_on_cut < left_weight) {
7090 local_thread_cut_line_weight_to_put_left(i) =
7091 thread_ii_weight_on_cut;
7094 local_thread_cut_line_weight_to_put_left(i) = left_weight;
7098 local_thread_cut_line_weight_to_put_left(i) = 0;
7103 auto local_least_signifiance = least_signifiance;
7104 auto local_significance_mul = significance_mul;
7105 Kokkos::parallel_for(
7106 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
7107 (0, 1), KOKKOS_LAMBDA (
int dummy) {
7111 for(mj_part_t i = no_cuts - 1; i > 0 ; --i) {
7112 mj_scalar_t cut1 = current_concurrent_cut_coordinate(i-1);
7113 mj_scalar_t cut2 = current_concurrent_cut_coordinate(i);
7114 mj_scalar_t delta = cut2 - cut1;
7115 mj_scalar_t abs_delta = (delta > 0) ? delta : -delta;
7116 if(abs_delta < local_sEpsilon) {
7117 local_thread_cut_line_weight_to_put_left(i) -=
7118 local_thread_cut_line_weight_to_put_left(i - 1);
7120 local_thread_cut_line_weight_to_put_left(i) =
7121 static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
7122 local_least_signifiance) * local_significance_mul) /
7123 static_cast<mj_scalar_t
>(local_significance_mul);
7129 auto local_thread_point_counts = this->thread_point_counts;
7130 Kokkos::parallel_for(
7131 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
7132 (0, num_parts), KOKKOS_LAMBDA (mj_part_t i) {
7133 local_thread_point_counts(i) = 0;
7144 mj_part_t *cut_map =
new mj_part_t[no_cuts];
7146 typedef uMultiSortItem<mj_lno_t, int, mj_scalar_t> multiSItem;
7147 typedef std::vector< multiSItem > multiSVector;
7148 typedef std::vector<multiSVector> multiS2Vector;
7151 std::vector<mj_scalar_t *>allocated_memory;
7154 multiS2Vector sort_vector_points_on_cut;
7157 mj_part_t different_cut_count = 1;
7163 multiSVector tmpMultiSVector;
7164 sort_vector_points_on_cut.push_back(tmpMultiSVector);
7166 auto local_current_concurrent_cut_coordinate =
7167 current_concurrent_cut_coordinate;
7168 auto host_current_concurrent_cut_coordinate =
7169 Kokkos::create_mirror_view(local_current_concurrent_cut_coordinate);
7170 Kokkos::deep_copy(host_current_concurrent_cut_coordinate,
7171 local_current_concurrent_cut_coordinate);
7173 for(mj_part_t i = 1; i < no_cuts ; ++i) {
7176 if(std::abs(host_current_concurrent_cut_coordinate(i) -
7177 host_current_concurrent_cut_coordinate(i-1)) < this->sEpsilon) {
7178 cut_map[i] = cut_map[i-1];
7181 cut_map[i] = different_cut_count++;
7182 multiSVector tmp2MultiSVector;
7183 sort_vector_points_on_cut.push_back(tmp2MultiSVector);
7186 Kokkos::deep_copy(current_concurrent_cut_coordinate,
7187 host_current_concurrent_cut_coordinate);
7190 auto host_coordinate_permutations =
7191 Kokkos::create_mirror_view(coordinate_permutations);
7192 Kokkos::deep_copy(host_coordinate_permutations, coordinate_permutations);
7194 auto host_assigned_part_ids = Kokkos::create_mirror_view(assigned_part_ids);
7195 Kokkos::deep_copy(host_assigned_part_ids, assigned_part_ids);
7197 auto host_mj_coordinates = Kokkos::create_mirror_view(mj_coordinates);
7198 Kokkos::deep_copy(host_mj_coordinates, mj_coordinates);
7200 auto host_thread_point_counts = Kokkos::create_mirror_view(thread_point_counts);
7201 Kokkos::deep_copy(host_thread_point_counts, thread_point_counts);
7203 auto local_coord_dim = this->coord_dim;
7205 for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7206 mj_lno_t i = host_coordinate_permutations(ii);
7207 mj_part_t pp = host_assigned_part_ids(i);
7208 mj_part_t p = pp / 2;
7211 mj_scalar_t *vals =
new mj_scalar_t[local_coord_dim -1];
7212 allocated_memory.push_back(vals);
7217 if(longest_dim_part) {
7219 for(
int dim = local_coord_dim - 2; dim >= 0; --dim) {
7222 int next_largest_coord_dim = p_coord_dimension_range_sorted[dim].id;
7227 host_mj_coordinates(i,next_largest_coord_dim);
7231 for(
int dim = coordInd + 1; dim < local_coord_dim; ++dim) {
7232 vals[val_ind++] = host_mj_coordinates(i,dim);
7234 for(
int dim = 0; dim < coordInd; ++dim) {
7235 vals[val_ind++] = host_mj_coordinates(i,dim);
7239 multiSItem tempSortItem(i, local_coord_dim -1, vals);
7241 mj_part_t cmap = cut_map[p];
7242 sort_vector_points_on_cut[cmap].push_back(tempSortItem);
7246 ++host_thread_point_counts(p);
7247 host_assigned_part_ids(i) = p;
7252 for(mj_part_t i = 0; i < different_cut_count; ++i) {
7253 std::sort (sort_vector_points_on_cut[i].begin(),
7254 sort_vector_points_on_cut[i].end());
7257 mj_part_t previous_cut_map = cut_map[0];
7259 auto host_thread_cut_line_weight_to_put_left =
7260 Kokkos::create_mirror_view(thread_cut_line_weight_to_put_left);
7261 Kokkos::deep_copy(host_thread_cut_line_weight_to_put_left,
7262 thread_cut_line_weight_to_put_left);
7264 auto host_mj_weights = Kokkos::create_mirror_view(mj_weights);
7265 Kokkos::deep_copy(host_mj_weights, mj_weights);
7275 mj_scalar_t weight_stolen_from_previous_part = 0;
7276 for(mj_part_t p = 0; p < no_cuts; ++p) {
7277 mj_part_t mapped_cut = cut_map[p];
7281 if(previous_cut_map != mapped_cut) {
7282 mj_lno_t sort_vector_end = (mj_lno_t)
7283 sort_vector_points_on_cut[previous_cut_map].size() - 1;
7284 for(; sort_vector_end >= 0; --sort_vector_end) {
7286 sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7287 mj_lno_t i = t.index;
7288 ++host_thread_point_counts(p);
7289 host_assigned_part_ids(i) = p;
7291 sort_vector_points_on_cut[previous_cut_map].clear();
7295 mj_lno_t sort_vector_end = (mj_lno_t)
7296 sort_vector_points_on_cut[mapped_cut].size() - 1;
7302 for(; sort_vector_end >= 0; --sort_vector_end) {
7305 multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
7307 mj_lno_t i = t.index;
7308 mj_scalar_t w = this->mj_uniform_weights(0) ? 1 :
7309 this->mj_weights(i,0);
7311 if(host_thread_cut_line_weight_to_put_left(p) +
7312 weight_stolen_from_previous_part> this->sEpsilon &&
7313 host_thread_cut_line_weight_to_put_left(p) +
7314 weight_stolen_from_previous_part -
7315 std::abs(host_thread_cut_line_weight_to_put_left(p) +
7316 weight_stolen_from_previous_part - w)> this->sEpsilon)
7318 host_thread_cut_line_weight_to_put_left(p) -= w;
7320 sort_vector_points_on_cut[mapped_cut].pop_back();
7322 ++host_thread_point_counts(p);
7323 host_assigned_part_ids(i) = p;
7327 if(p < no_cuts - 1 &&
7328 host_thread_cut_line_weight_to_put_left(p) < this->sEpsilon) {
7329 if(mapped_cut == cut_map[p + 1] ) {
7332 if(previous_cut_map != mapped_cut) {
7333 weight_stolen_from_previous_part =
7334 host_thread_cut_line_weight_to_put_left(p);
7339 weight_stolen_from_previous_part +=
7340 host_thread_cut_line_weight_to_put_left(p);
7344 weight_stolen_from_previous_part =
7345 -host_thread_cut_line_weight_to_put_left(p);
7354 if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]) {
7355 if(previous_cut_map != mapped_cut) {
7356 weight_stolen_from_previous_part =
7357 host_thread_cut_line_weight_to_put_left(p);
7360 weight_stolen_from_previous_part +=
7361 host_thread_cut_line_weight_to_put_left(p);
7365 weight_stolen_from_previous_part =
7366 -host_thread_cut_line_weight_to_put_left(p);
7372 previous_cut_map = mapped_cut;
7377 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[
7378 previous_cut_map].size() - 1;
7384 for(; sort_vector_end >= 0; --sort_vector_end) {
7386 multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7389 mj_lno_t i = t.index;
7390 ++host_thread_point_counts(no_cuts);
7391 host_assigned_part_ids(i) = no_cuts;
7394 sort_vector_points_on_cut[previous_cut_map].clear();
7398 mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
7399 for(mj_lno_t i = 0; i < vSize; ++i) {
7400 delete [] allocated_memory[i];
7403 auto local_out_part_xadj = out_part_xadj;
7404 auto host_out_part_xadj = Kokkos::create_mirror_view(local_out_part_xadj);
7405 Kokkos::deep_copy(host_out_part_xadj, out_part_xadj);
7408 for(mj_part_t j = 0; j < num_parts; ++j) {
7409 host_out_part_xadj(j) = host_thread_point_counts(j);
7410 host_thread_point_counts(j) = 0;
7414 for(mj_part_t j = 1; j < num_parts; ++j) {
7415 host_out_part_xadj(j) += host_out_part_xadj(j - 1);
7420 for(mj_part_t j = 1; j < num_parts; ++j) {
7421 host_thread_point_counts(j) += host_out_part_xadj(j - 1);
7424 auto host_new_coordinate_permutations =
7425 Kokkos::create_mirror_view(new_coordinate_permutations);
7426 Kokkos::deep_copy(host_new_coordinate_permutations,
7427 new_coordinate_permutations);
7431 for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7432 mj_lno_t i = host_coordinate_permutations(ii);
7433 mj_part_t p = host_assigned_part_ids(i);
7434 host_new_coordinate_permutations(coordinate_begin +
7435 host_thread_point_counts(p)++) = i;
7438 Kokkos::deep_copy(thread_point_counts, host_thread_point_counts);
7439 Kokkos::deep_copy(new_coordinate_permutations,
7440 host_new_coordinate_permutations);
7441 Kokkos::deep_copy(local_out_part_xadj, host_out_part_xadj);
7453 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7454 typename mj_part_t,
typename mj_node_t>
7455 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7457 mj_part_t current_num_parts,
7458 mj_part_t output_part_begin_index,
7459 RCP<mj_partBoxVector_t> &output_part_boxes,
7460 bool is_data_ever_migrated)
7463 mj_timer_base_string +
"Part_Assignment");
7466 auto local_mj_keep_part_boxes = mj_keep_part_boxes;
7467 auto local_coordinate_permutations = coordinate_permutations;
7468 auto local_assigned_part_ids = assigned_part_ids;
7470 if(local_mj_keep_part_boxes) {
7471 for(
int i = 0; i < current_num_parts; ++i) {
7472 (*output_part_boxes)[i].setpId(i + output_part_begin_index);
7476 Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy(
7477 current_num_parts, Kokkos::AUTO());
7478 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>
::
7480 Kokkos::parallel_for(policy, KOKKOS_LAMBDA(member_type team_member) {
7481 int i = team_member.league_rank();
7482 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, (i != 0) ?
7483 local_part_xadj(i-1) : 0, local_part_xadj(i)),
7485 mj_lno_t k = local_coordinate_permutations(ii);
7486 local_assigned_part_ids(k) = i + output_part_begin_index;
7490 if(is_data_ever_migrated) {
7491 #ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
7492 if(
sizeof(mj_lno_t) <=
sizeof(int)) {
7499 ZOLTAN_COMM_OBJ *plan = NULL;
7500 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->mj_problemComm));
7503 int message_tag = 7856;
7506 mj_timer_base_string +
"Final Z1PlanCreating");
7509 int ierr = Zoltan_Comm_Create( &plan,
int(this->num_local_coords),
7510 this->owner_of_coordinate.data(), mpi_comm, message_tag, &incoming);
7514 mj_timer_base_string +
"Final Z1PlanCreating" );
7517 mj_timer_base_string +
"Final Z1PlanComm");
7524 auto host_current_mj_gnos = Kokkos::create_mirror_view(
7525 Kokkos::HostSpace(), this->current_mj_gnos);
7526 deep_copy(host_current_mj_gnos, this->current_mj_gnos);
7527 Kokkos::View<mj_gno_t*, device_t> dst_gnos(
7528 Kokkos::ViewAllocateWithoutInitializing(
"dst_gnos"), incoming);
7529 auto host_dst_gnos = Kokkos::create_mirror_view(
7530 Kokkos::HostSpace(), dst_gnos);
7532 ierr = Zoltan_Comm_Do( plan, message_tag,
7533 (
char *) host_current_mj_gnos.data(),
7534 sizeof(mj_gno_t), (
char *) host_dst_gnos.data());
7536 Kokkos::deep_copy(dst_gnos, host_dst_gnos);
7537 this->current_mj_gnos = dst_gnos;
7540 auto host_src_part_ids = Kokkos::create_mirror_view(
7541 Kokkos::HostSpace(), this->assigned_part_ids);
7542 deep_copy(host_src_part_ids, this->assigned_part_ids);
7543 Kokkos::View<mj_part_t*, device_t> dst_part_ids(
7544 Kokkos::ViewAllocateWithoutInitializing(
"dst_part_ids"), incoming);
7545 auto host_dst_part_ids = Kokkos::create_mirror_view(
7546 Kokkos::HostSpace(), dst_part_ids);
7548 ierr = Zoltan_Comm_Do( plan, message_tag,
7549 (
char *) host_src_part_ids.data(),
7550 sizeof(mj_part_t), (
char *) host_dst_part_ids.data());
7552 Kokkos::deep_copy(dst_part_ids, host_dst_part_ids);
7553 this->assigned_part_ids = dst_part_ids;
7555 ierr = Zoltan_Comm_Destroy(&plan);
7558 this->num_local_coords = incoming;
7561 mj_timer_base_string +
"Final Z1PlanComm");
7564 #endif // ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
7568 mj_timer_base_string +
"Final DistributorPlanCreating");
7569 Tpetra::Distributor distributor(this->mj_problemComm);
7570 ArrayView<const mj_part_t> owners_of_coords(
7571 this->owner_of_coordinate.data(), this->num_local_coords);
7572 mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
7574 mj_timer_base_string +
"Final DistributorPlanCreating" );
7577 mj_timer_base_string +
"Final DistributorPlanComm");
7583 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
7584 Kokkos::ViewAllocateWithoutInitializing(
"sent_gnos"),
7585 this->current_mj_gnos.extent(0));
7586 Kokkos::deep_copy(sent_gnos, this->current_mj_gnos);
7588 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
7589 Kokkos::ViewAllocateWithoutInitializing(
"received_gnos"),
7592 distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
7594 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
7595 Kokkos::ViewAllocateWithoutInitializing(
"current_mj_gnos"), incoming);
7597 Kokkos::deep_copy(this->current_mj_gnos, received_gnos);
7600 Kokkos::View<mj_part_t *, Kokkos::HostSpace> sent_partids(
7601 Kokkos::ViewAllocateWithoutInitializing(
"sent_partids"),
7602 this->assigned_part_ids.extent(0));
7603 Kokkos::deep_copy(sent_partids, this->assigned_part_ids);
7605 Kokkos::View<mj_part_t *, Kokkos::HostSpace> received_partids(
7606 Kokkos::ViewAllocateWithoutInitializing(
"received_partids"),
7609 distributor.doPostsAndWaits(sent_partids, 1, received_partids);
7611 this->assigned_part_ids =
7612 Kokkos::View<mj_part_t*, device_t>(
7613 Kokkos::ViewAllocateWithoutInitializing(
"assigned_part_ids"),
7616 Kokkos::deep_copy(this->assigned_part_ids, received_partids);
7617 this->num_local_coords = incoming;
7620 mj_timer_base_string +
"Final DistributorPlanComm");
7625 mj_timer_base_string +
"Part_Assignment");
7628 mj_timer_base_string +
"Solution_Part_Assignment");
7633 if(this->mj_keep_part_boxes) {
7634 this->kept_boxes = compute_global_box_boundaries(output_part_boxes);
7638 mj_timer_base_string +
"Solution_Part_Assignment");
7653 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7654 typename mj_part_t,
typename mj_node_t>
7657 bool distribute_points_on_cut_lines_,
7658 int max_concurrent_part_calculation_,
7659 int check_migrate_avoid_migration_option_,
7660 double minimum_migration_imbalance_,
7661 int migration_type_)
7663 this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
7664 this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
7665 this->check_migrate_avoid_migration_option =
7666 check_migrate_avoid_migration_option_;
7667 this->minimum_migration_imbalance = minimum_migration_imbalance_;
7668 this->migration_type = migration_type_;
7698 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7699 typename mj_part_t,
typename mj_node_t>
7702 const RCP<const Environment> &env,
7703 RCP<
const Comm<int> > &problemComm,
7704 double imbalance_tolerance_,
7706 size_t num_global_parts_,
7707 Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array_,
7708 int recursion_depth_,
7710 mj_lno_t num_local_coords_,
7711 mj_gno_t num_global_coords_,
7712 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
7714 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
7715 int num_weights_per_coord_,
7716 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights_,
7717 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
7718 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts_,
7719 Kokkos::View<mj_part_t *, device_t> & result_assigned_part_ids_,
7720 Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos_)
7725 this->mj_timer_base_string =
"MJ(" + std::to_string(execute_counter) +
") - ";
7728 this->mj_problemComm = problemComm;
7729 this->myActualRank = this->myRank = this->mj_problemComm->getRank();
7731 mj_timer_base_string +
"Total");
7732 this->mj_env->debug(3,
"In MultiJagged Jagged");
7733 this->imbalance_tolerance = imbalance_tolerance_;
7734 this->mj_num_teams = num_teams_;
7735 this->num_global_parts = num_global_parts_;
7736 this->part_no_array = part_no_array_;
7737 this->recursion_depth = recursion_depth_;
7738 this->coord_dim = coord_dim_;
7739 this->num_local_coords = num_local_coords_;
7740 this->num_global_coords = num_global_coords_;
7741 this->mj_coordinates = mj_coordinates_;
7742 this->initial_mj_gnos = initial_mj_gnos_;
7743 this->num_weights_per_coord = num_weights_per_coord_;
7744 this->mj_uniform_weights = mj_uniform_weights_;
7745 this->mj_weights = mj_weights_;
7746 this->mj_uniform_parts = mj_uniform_parts_;
7750 this->set_part_specifications();
7753 mj_timer_base_string +
"Allocate Views");
7754 this->allocate_set_work_memory();
7756 mj_timer_base_string +
"Allocate Views");
7760 this->comm = this->mj_problemComm->duplicate();
7763 if(comm->getRank() == 0) {
7764 std::cout <<
"size of gno:" <<
sizeof(mj_gno_t) << std::endl;
7765 std::cout <<
"size of lno:" <<
sizeof(mj_lno_t) << std::endl;
7766 std::cout <<
"size of mj_scalar_t:" <<
sizeof(mj_scalar_t) << std::endl;
7771 mj_part_t current_num_parts = 1;
7772 Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
7773 this->all_cut_coordinates;
7775 mj_timer_base_string +
"Problem_Partitioning");
7776 mj_part_t output_part_begin_index = 0;
7777 mj_part_t future_num_parts = this->total_num_part;
7778 bool is_data_ever_migrated =
false;
7780 std::vector<mj_part_t> *future_num_part_in_parts =
7781 new std::vector<mj_part_t> ();
7782 std::vector<mj_part_t> *next_future_num_parts_in_parts =
7783 new std::vector<mj_part_t> ();
7785 next_future_num_parts_in_parts->push_back(this->num_global_parts);
7787 RCP<mj_partBoxVector_t> input_part_boxes;
7788 RCP<mj_partBoxVector_t> output_part_boxes;
7790 if(this->mj_keep_part_boxes) {
7791 input_part_boxes = RCP<mj_partBoxVector_t>(
new mj_partBoxVector_t(),
true);
7792 output_part_boxes = RCP<mj_partBoxVector_t>(
new mj_partBoxVector_t(),
true);
7793 compute_global_box();
7794 this->init_part_boxes(output_part_boxes);
7801 Kokkos::View<mj_part_t*, device_t>
7802 view_rectilinear_cut_count(
"view_rectilinear_cut_count", 1);
7803 Kokkos::View<size_t*, device_t>
7804 view_total_reduction_size(
"view_total_reduction_size", 1);
7806 for(
int i = 0; i < this->recursion_depth; ++i) {
7809 std::string istring = std::to_string(i);
7815 std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
7816 future_num_part_in_parts = next_future_num_parts_in_parts;
7817 next_future_num_parts_in_parts = tmpPartVect;
7821 next_future_num_parts_in_parts->clear();
7822 if(this->mj_keep_part_boxes) {
7823 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7824 input_part_boxes = output_part_boxes;
7825 output_part_boxes = tmpPartBoxes;
7826 output_part_boxes->clear();
7830 mj_part_t output_part_count_in_dimension =
7831 this->update_part_num_arrays(
7832 future_num_part_in_parts,
7833 next_future_num_parts_in_parts,
7838 output_part_boxes, 1);
7843 if(output_part_count_in_dimension == current_num_parts) {
7845 tmpPartVect= future_num_part_in_parts;
7846 future_num_part_in_parts = next_future_num_parts_in_parts;
7847 next_future_num_parts_in_parts = tmpPartVect;
7849 if(this->mj_keep_part_boxes) {
7850 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7851 input_part_boxes = output_part_boxes;
7852 output_part_boxes = tmpPartBoxes;
7858 int coordInd = i % this->coord_dim;
7860 Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
7861 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
7864 mj_timer_base_string +
"Problem_Partitioning_" + istring);
7868 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
7869 "new part xadj", output_part_count_in_dimension);
7872 mj_part_t output_part_index = 0;
7876 mj_part_t output_coordinate_end_index = 0;
7878 mj_part_t current_work_part = 0;
7879 mj_part_t current_concurrent_num_parts =
7880 std::min(current_num_parts - current_work_part,
7881 this->max_concurrent_part_calculation);
7883 mj_part_t obtained_part_index = 0;
7885 auto host_process_local_min_max_coord_total_weight =
7886 Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
7887 auto host_global_min_max_coord_total_weight =
7888 Kokkos::create_mirror_view(global_min_max_coord_total_weight);
7891 for(; current_work_part < current_num_parts;
7894 current_concurrent_num_parts =
7895 std::min(current_num_parts - current_work_part,
7896 this->max_concurrent_part_calculation);
7899 auto local_device_num_partitioning_in_current_dim =
7900 device_num_partitioning_in_current_dim;
7901 Kokkos::parallel_reduce(
"Read bDoingWork",
7902 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
7903 KOKKOS_LAMBDA(
int dummy,
int & set_single) {
7906 if(local_device_num_partitioning_in_current_dim(
7907 current_work_part + kk) != 1) {
7913 bool bDoingWork = (bDoingWork_int != 0) ?
true :
false;
7915 this->mj_get_local_min_max_coord_totW(
7917 current_concurrent_num_parts,
7918 mj_current_dim_coords);
7923 this->mj_get_global_min_max_coord_totW(
7924 current_concurrent_num_parts,
7925 this->process_local_min_max_coord_total_weight,
7926 this->global_min_max_coord_total_weight);
7930 mj_part_t total_incomplete_cut_count = 0;
7935 mj_part_t concurrent_part_cut_shift = 0;
7936 mj_part_t concurrent_part_part_shift = 0;
7940 Kokkos::deep_copy(host_global_min_max_coord_total_weight,
7941 global_min_max_coord_total_weight);
7943 mj_scalar_t min_coordinate =
7944 host_global_min_max_coord_total_weight(kk);
7945 mj_scalar_t max_coordinate =
7946 host_global_min_max_coord_total_weight(
7947 kk + current_concurrent_num_parts);
7949 mj_scalar_t global_total_weight =
7950 host_global_min_max_coord_total_weight(
7951 kk + 2 * current_concurrent_num_parts);
7953 mj_part_t concurrent_current_part_index = current_work_part + kk;
7955 mj_part_t partition_count = host_num_partitioning_in_current_dim(
7956 concurrent_current_part_index);
7958 Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
7959 Kokkos::subview(current_cut_coordinates,
7960 std::pair<mj_lno_t, mj_lno_t>(
7961 concurrent_part_cut_shift, current_cut_coordinates.size()));
7962 Kokkos::View<mj_scalar_t *, device_t>
7963 current_target_part_weights =
7964 Kokkos::subview(target_part_weights,
7965 std::pair<mj_lno_t, mj_lno_t>(
7966 concurrent_part_part_shift, target_part_weights.size()));
7969 concurrent_part_cut_shift += partition_count - 1;
7971 concurrent_part_part_shift += partition_count;
7975 if(partition_count > 1 && min_coordinate <= max_coordinate) {
7979 total_incomplete_cut_count += partition_count - 1;
7981 this->incomplete_cut_count(kk) = partition_count - 1;
7984 this->mj_get_initial_cut_coords_target_weights(
7987 partition_count - 1,
7988 global_total_weight,
7990 current_target_part_weights,
7991 future_num_part_in_parts,
7992 next_future_num_parts_in_parts,
7993 concurrent_current_part_index,
7994 obtained_part_index);
7996 mj_lno_t coordinate_end_index =
7997 host_part_xadj(concurrent_current_part_index);
7998 mj_lno_t coordinate_begin_index =
7999 concurrent_current_part_index==0 ? 0 :
8000 host_part_xadj(concurrent_current_part_index - 1);
8002 this->set_initial_coordinate_parts(
8005 coordinate_begin_index, coordinate_end_index,
8006 this->coordinate_permutations,
8007 mj_current_dim_coords,
8008 this->assigned_part_ids,
8014 this->incomplete_cut_count(kk) = 0;
8017 obtained_part_index += partition_count;
8022 double used_imbalance = 0;
8025 mj_timer_base_string +
"Problem_Partitioning Get Part Weights");
8028 mj_current_dim_coords,
8031 current_concurrent_num_parts,
8032 current_cut_coordinates,
8033 total_incomplete_cut_count,
8034 view_rectilinear_cut_count,
8035 view_total_reduction_size);
8038 mj_timer_base_string +
"Problem_Partitioning Get Part Weights");
8043 mj_part_t output_array_shift = 0;
8044 mj_part_t cut_shift = 0;
8045 size_t tlr_shift = 0;
8046 size_t partweight_array_shift = 0;
8049 mj_part_t current_concurrent_work_part = current_work_part + kk;
8051 mj_part_t num_parts = host_num_partitioning_in_current_dim(
8052 current_concurrent_work_part);
8055 int coordinateA_bigger_than_coordinateB =
8056 host_global_min_max_coord_total_weight(kk) >
8057 host_global_min_max_coord_total_weight(
8058 kk + current_concurrent_num_parts);
8060 if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
8063 auto local_new_part_xadj = this->new_part_xadj;
8064 Kokkos::parallel_for(
8065 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8066 (0, num_parts), KOKKOS_LAMBDA (mj_part_t jj) {
8067 local_new_part_xadj(
8068 output_part_index + output_array_shift + jj) = 0;
8071 cut_shift += num_parts - 1;
8072 tlr_shift += (4 *(num_parts - 1) + 1);
8073 output_array_shift += num_parts;
8074 partweight_array_shift += (2 * (num_parts - 1) + 1);
8078 Kokkos::View<mj_scalar_t *, device_t>
8079 current_concurrent_cut_coordinate =
8080 Kokkos::subview(current_cut_coordinates,
8081 std::pair<mj_lno_t, mj_lno_t>(
8083 current_cut_coordinates.size()));
8084 Kokkos::View<mj_scalar_t *, device_t>
8085 used_local_cut_line_weight_to_left =
8086 Kokkos::subview(process_cut_line_weight_to_put_left,
8087 std::pair<mj_lno_t, mj_lno_t>(
8089 process_cut_line_weight_to_put_left.size()));
8091 this->thread_part_weight_work =
8093 this->thread_part_weights,
8094 std::pair<mj_lno_t, mj_lno_t>(
8095 partweight_array_shift,
8096 this->thread_part_weights.extent(0)));
8099 if(this->mj_keep_part_boxes) {
8101 for(mj_part_t j = 0; j < num_parts - 1; ++j) {
8102 mj_scalar_t temp_get_val;
8103 Kokkos::parallel_reduce(
"Read single",
8104 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8105 KOKKOS_LAMBDA(
int dummy, mj_scalar_t & set_single) {
8106 set_single = current_concurrent_cut_coordinate(j);
8108 (*output_part_boxes)
8109 [output_array_shift + output_part_index + j].
8110 updateMinMax(temp_get_val, 1 , coordInd);
8111 (*output_part_boxes)
8112 [output_array_shift + output_part_index + j + 1].
8113 updateMinMax(temp_get_val, 0 , coordInd);
8118 Kokkos::View<mj_lno_t*, device_t> sub_new_part_xadj =
8119 Kokkos::subview(this->new_part_xadj,
8120 std::pair<mj_lno_t, mj_lno_t>(
8121 output_part_index + output_array_shift,
8122 this->new_part_xadj.size()));
8124 this->mj_create_new_partitions(
8126 current_concurrent_work_part,
8127 mj_current_dim_coords,
8128 current_concurrent_cut_coordinate,
8129 used_local_cut_line_weight_to_left,
8134 mj_lno_t coordinate_end = host_part_xadj(
8135 current_concurrent_work_part);
8136 mj_lno_t coordinate_begin =
8137 current_concurrent_work_part==0 ? 0 : host_part_xadj(
8138 current_concurrent_work_part - 1);
8142 mj_lno_t part_size = coordinate_end - coordinate_begin;
8146 auto local_new_part_xadj = this->new_part_xadj;
8147 Kokkos::parallel_for(
8148 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
8149 (0, 1), KOKKOS_LAMBDA (
int dummy) {
8150 local_new_part_xadj(
8151 output_part_index + output_array_shift) = part_size;
8154 auto subview_new_coordinate_permutations =
8155 Kokkos::subview(this->new_coordinate_permutations,
8156 std::pair<mj_lno_t, mj_lno_t>(
8158 coordinate_begin + part_size));
8159 auto subview_coordinate_permutations =
8160 Kokkos::subview(this->coordinate_permutations,
8161 std::pair<mj_lno_t, mj_lno_t>(
8163 coordinate_begin + part_size));
8164 Kokkos::deep_copy(subview_new_coordinate_permutations,
8165 subview_coordinate_permutations);
8167 cut_shift += num_parts - 1;
8168 output_array_shift += num_parts;
8169 partweight_array_shift += (2 * (num_parts - 1) + 1);
8179 mj_part_t num_parts =
8180 host_num_partitioning_in_current_dim(current_work_part + kk);
8184 auto local_new_part_xadj = this->new_part_xadj;
8185 Kokkos::parallel_for(
8186 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8187 (0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
8188 local_new_part_xadj(output_part_index+ii) +=
8189 output_coordinate_end_index;
8194 Kokkos::parallel_reduce(
"Read single",
8195 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8196 KOKKOS_LAMBDA(
int dummy, mj_part_t & set_single) {
8198 local_new_part_xadj(output_part_index + num_parts - 1);
8200 output_coordinate_end_index = temp_get;
8202 output_part_index += num_parts;
8208 int current_world_size = this->comm->getSize();
8209 long migration_reduce_all_population =
8210 this->total_dim_num_reduce_all * current_world_size;
8211 bool is_migrated_in_current_dimension =
false;
8216 if(future_num_parts > 1 &&
8217 this->check_migrate_avoid_migration_option >= 0 &&
8218 current_world_size > 1) {
8220 mj_timer_base_string +
"Problem_Migration-" + istring);
8221 mj_part_t num_parts = output_part_count_in_dimension;
8223 if(this->mj_perform_migration(
8226 next_future_num_parts_in_parts,
8227 output_part_begin_index,
8228 migration_reduce_all_population,
8229 this->num_global_coords / (future_num_parts * current_num_parts),
8231 input_part_boxes, output_part_boxes) )
8233 is_migrated_in_current_dimension =
true;
8234 is_data_ever_migrated =
true;
8236 mj_timer_base_string +
"Problem_Migration-" + istring);
8239 this->total_dim_num_reduce_all /= num_parts;
8242 is_migrated_in_current_dimension =
false;
8244 mj_timer_base_string +
"Problem_Migration-" + istring);
8249 Kokkos::View<mj_lno_t*, device_t> tmp =
8250 this->coordinate_permutations;
8251 this->coordinate_permutations =
8252 this->new_coordinate_permutations;
8254 this->new_coordinate_permutations = tmp;
8255 if(!is_migrated_in_current_dimension) {
8256 this->total_dim_num_reduce_all -= current_num_parts;
8257 current_num_parts = output_part_count_in_dimension;
8262 local_part_xadj = this->new_part_xadj;
8263 this->host_part_xadj = Kokkos::create_mirror_view(
part_xadj);
8264 Kokkos::deep_copy(host_part_xadj,
part_xadj);
8266 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
"empty", 0);
8268 mj_timer_base_string +
"Problem_Partitioning_" + istring);
8273 delete future_num_part_in_parts;
8274 delete next_future_num_parts_in_parts;
8276 mj_timer_base_string +
"Problem_Partitioning");
8282 this->set_final_parts(
8284 output_part_begin_index,
8286 is_data_ever_migrated);
8288 result_assigned_part_ids_ = this->assigned_part_ids;
8289 result_mj_gnos_ = this->current_mj_gnos;
8291 mj_timer_base_string +
"Total");
8292 this->mj_env->debug(3,
"Out of MultiJagged");
8295 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
8296 typename mj_part_t,
typename mj_node_t>
8297 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8302 if(this->mj_keep_part_boxes) {
8303 return this->kept_boxes;
8306 throw std::logic_error(
"Error: part boxes are not stored.");
8310 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
8311 typename mj_part_t,
typename mj_node_t>
8312 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8318 mj_part_t ntasks = this->num_global_parts;
8319 int dim = (*localPartBoxes)[0].getDim();
8320 coord_t *localPartBoundaries =
new coord_t[ntasks * 2 *dim];
8322 memset(localPartBoundaries, 0,
sizeof(coord_t) * ntasks * 2 *dim);
8324 coord_t *globalPartBoundaries =
new coord_t[ntasks * 2 *dim];
8325 memset(globalPartBoundaries, 0,
sizeof(coord_t) * ntasks * 2 *dim);
8327 coord_t *localPartMins = localPartBoundaries;
8328 coord_t *localPartMaxs = localPartBoundaries + ntasks * dim;
8330 coord_t *globalPartMins = globalPartBoundaries;
8331 coord_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
8333 mj_part_t boxCount = localPartBoxes->size();
8334 for(mj_part_t i = 0; i < boxCount; ++i) {
8335 mj_part_t pId = (*localPartBoxes)[i].getpId();
8339 coord_t *lmins = (*localPartBoxes)[i].getlmins();
8340 coord_t *lmaxs = (*localPartBoxes)[i].getlmaxs();
8342 for(
int j = 0; j < dim; ++j) {
8343 localPartMins[dim * pId + j] = lmins[j];
8344 localPartMaxs[dim * pId + j] = lmaxs[j];
8357 reduceAll<int, coord_t>(*mj_problemComm, reductionOp,
8358 ntasks * 2 *dim, localPartBoundaries, globalPartBoundaries);
8360 RCP<mj_partBoxVector_t> pB(
new mj_partBoxVector_t(),
true);
8361 for(mj_part_t i = 0; i < ntasks; ++i) {
8363 globalPartMins + dim * i,
8364 globalPartMaxs + dim * i);
8377 delete []localPartBoundaries;
8378 delete []globalPartBoundaries;
8385 template <
typename Adapter>
8391 #ifndef DOXYGEN_SHOULD_SKIP_THIS
8395 typedef typename Adapter::scalar_t adapter_scalar_t;
8398 typedef float default_mj_scalar_t;
8404 (std::is_same<adapter_scalar_t, float>::value ||
8405 std::is_same<adapter_scalar_t, double>::value),
8406 adapter_scalar_t, default_mj_scalar_t>::type mj_scalar_t;
8413 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
8414 typedef typename mj_node_t::device_type
device_t;
8419 RCP<const Environment> mj_env;
8420 RCP<const Comm<int> > mj_problemComm;
8421 RCP<const typename Adapter::base_adapter_t> mj_adapter;
8424 double imbalance_tolerance;
8428 size_t num_global_parts;
8431 Kokkos::View<mj_part_t*, Kokkos::HostSpace> part_no_array;
8434 int recursion_depth;
8437 mj_lno_t num_local_coords;
8438 mj_gno_t num_global_coords;
8441 Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
8445 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8448 int num_weights_per_coord;
8451 Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_weights;
8454 Kokkos::View<mj_scalar_t**, device_t> mj_weights;
8457 Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_parts;
8467 mj_part_t num_first_level_parts;
8471 Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;
8475 bool distribute_points_on_cut_lines;
8478 mj_part_t max_concurrent_part_calculation;
8481 int check_migrate_avoid_migration_option;
8489 double minimum_migration_imbalance;
8490 bool mj_keep_part_boxes;
8494 int mj_premigration_option;
8495 int min_coord_per_rank_for_premigration;
8498 ArrayRCP<mj_part_t> comXAdj_;
8501 ArrayRCP<mj_part_t> comAdj_;
8506 void set_input_parameters(
const Teuchos::ParameterList &p);
8508 RCP<mj_partBoxVector_t> getGlobalBoxBoundaries()
const;
8510 bool mj_premigrate_to_subset(
8512 int migration_selection_option,
8513 RCP<const Environment> mj_env_,
8514 RCP<
const Comm<int> > mj_problemComm_,
8516 mj_lno_t num_local_coords_,
8517 mj_gno_t num_global_coords_,
size_t num_global_parts_,
8518 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8520 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8522 int num_weights_per_coord_,
8523 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8525 RCP<
const Comm<int> > &result_problemComm_,
8526 mj_lno_t & result_num_local_coords_,
8527 Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8529 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8530 result_mj_coordinates_,
8531 Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8532 int * &result_actual_owner_rank_);
8537 RCP<
const Comm<int> > &problemComm,
8538 const RCP<const typename Adapter::base_adapter_t> &adapter) :
8541 mj_problemComm(problemComm),
8542 mj_adapter(adapter),
8543 imbalance_tolerance(0),
8545 num_global_parts(1),
8548 num_local_coords(0),
8549 num_global_coords(0),
8550 num_weights_per_coord(0),
8551 num_first_level_parts(1),
8552 distribute_points_on_cut_lines(true),
8553 max_concurrent_part_calculation(1),
8554 check_migrate_avoid_migration_option(0),
8556 minimum_migration_imbalance(0.30),
8557 mj_keep_part_boxes(false),
8558 mj_run_as_rcb(false),
8559 mj_premigration_option(0),
8560 min_coord_per_rank_for_premigration(32000),
8574 const bool bUnsorted =
true;
8575 RCP<Zoltan2::IntegerRangeListValidator<int>> mj_parts_Validator =
8577 pl.set(
"mj_parts",
"0",
"list of parts for multiJagged partitioning "
8578 "algorithm. As many as the dimension count.", mj_parts_Validator);
8580 pl.set(
"mj_concurrent_part_count", 1,
"The number of parts whose cut "
8581 "coordinates will be calculated concurently.",
8584 pl.set(
"mj_minimum_migration_imbalance", 1.1,
8585 "mj_minimum_migration_imbalance, the minimum imbalance of the "
8586 "processors to avoid migration",
8589 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_option_validator =
8590 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 2) );
8591 pl.set(
"mj_migration_option", 1,
"Migration option, 0 for decision "
8592 "depending on the imbalance, 1 for forcing migration, 2 for "
8593 "avoiding migration", mj_migration_option_validator);
8595 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_type_validator =
8596 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 1) );
8597 pl.set(
"mj_migration_type", 0,
8598 "Migration type, 0 for migration to minimize the imbalance "
8599 "1 for migration to minimize messages exchanged the migration.",
8600 mj_migration_option_validator);
8603 pl.set(
"mj_keep_part_boxes",
false,
"Keep the part boundaries of the "
8607 pl.set(
"mj_enable_rcb",
false,
"Use MJ as RCB.",
8610 pl.set(
"mj_recursion_depth", -1,
"Recursion depth for MJ: Must be "
8613 RCP<Teuchos::EnhancedNumberValidator<int>>
8614 mj_num_teams_validator =
8615 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(
8616 0, Teuchos::EnhancedNumberTraits<int>::max()) );
8617 pl.set(
"mj_num_teams", 0,
8618 "How many teams for the main kernel loop"
8619 , mj_num_teams_validator);
8621 RCP<Teuchos::EnhancedNumberValidator<int>>
8622 mj_premigration_option_validator =
8623 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 1024) );
8625 pl.set(
"mj_premigration_option", 0,
8626 "Whether to do premigration or not. 0 for no migration "
8627 "x > 0 for migration to consecutive processors, "
8628 "the subset will be 0,x,2x,3x,...subset ranks."
8629 , mj_premigration_option_validator);
8631 pl.set(
"mj_premigration_coordinate_count", 32000,
"How many coordinate to "
8632 "assign each rank in multijagged after premigration"
8645 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
8649 mj_part_t pointAssign(
int dim, adapter_scalar_t *point)
const;
8651 void boxAssign(
int dim, adapter_scalar_t *lower, adapter_scalar_t *upper,
8652 size_t &nPartsFound, mj_part_t **partsFound)
const;
8656 void getCommunicationGraph(
8658 ArrayRCP<mj_part_t> &comXAdj,
8659 ArrayRCP<mj_part_t> &comAdj);
8661 void set_up_partitioning_data(
8665 std::string timer_base_string;
8673 template<
class dst_t,
class src_t>
8674 typename std::enable_if<std::is_same<
typename dst_t::value_type,
8675 typename src_t::value_type>::value>::type
8676 assign_if_same(dst_t & dst,
const src_t & src) {
8679 template<
class dst_t,
class src_t>
8680 typename std::enable_if<!std::is_same<
typename dst_t::value_type,
8681 typename src_t::value_type>::value>::type
8682 assign_if_same(dst_t & dst,
const src_t & src) {
8687 template <
typename Adapter>
8688 bool Zoltan2_AlgMJ<Adapter>::mj_premigrate_to_subset(
8690 int migration_selection_option,
8691 RCP<const Environment> mj_env_,
8692 RCP<
const Comm<int> > mj_problemComm_,
8694 mj_lno_t num_local_coords_,
8695 mj_gno_t num_global_coords_,
size_t num_global_parts_,
8696 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8698 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
8699 int num_weights_per_coord_,
8700 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8702 RCP<
const Comm<int> > & result_problemComm_,
8703 mj_lno_t &result_num_local_coords_,
8704 Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8706 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8707 result_mj_coordinates_,
8708 Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8709 int * &result_actual_owner_rank_)
8712 timer_base_string +
"PreMigration DistributorPlanCreating");
8714 int myRank = mj_problemComm_->getRank();
8715 int worldSize = mj_problemComm_->getSize();
8717 mj_part_t groupsize = worldSize / used_num_ranks;
8719 std::vector<mj_part_t> group_begins(used_num_ranks + 1, 0);
8721 mj_part_t i_am_sending_to = 0;
8722 bool am_i_a_receiver =
false;
8724 for(
int i = 0; i < used_num_ranks; ++i) {
8725 group_begins[i+ 1] = group_begins[i] + groupsize;
8726 if(worldSize % used_num_ranks > i) group_begins[i+ 1] += 1;
8727 if(i == used_num_ranks) group_begins[i+ 1] = worldSize;
8728 if(myRank >= group_begins[i] && myRank < group_begins[i + 1]) {
8729 i_am_sending_to = group_begins[i];
8731 if(myRank == group_begins[i]) {
8732 am_i_a_receiver =
true;
8736 ArrayView<const mj_part_t> idView(&(group_begins[0]), used_num_ranks );
8737 result_problemComm_ = mj_problemComm_->createSubcommunicator(idView);
8739 Tpetra::Distributor distributor(mj_problemComm_);
8741 std::vector<mj_part_t>
8742 coordinate_destinations(num_local_coords_, i_am_sending_to);
8744 ArrayView<const mj_part_t>
8745 destinations(&(coordinate_destinations[0]), num_local_coords_);
8746 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
8747 result_num_local_coords_ = num_incoming_gnos;
8749 timer_base_string +
"PreMigration DistributorPlanCreating");
8752 timer_base_string +
"PreMigration DistributorMigration");
8760 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
8761 Kokkos::ViewAllocateWithoutInitializing(
"sent_gnos"),
8762 initial_mj_gnos_.size());
8763 Kokkos::deep_copy(sent_gnos, initial_mj_gnos_);
8765 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos (
8766 Kokkos::ViewAllocateWithoutInitializing(
"received_gnos"),
8769 distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
8771 result_initial_mj_gnos_ = Kokkos::View<mj_gno_t*, device_t>(
8772 Kokkos::ViewAllocateWithoutInitializing(
"result_initial_mj_gnos_"),
8774 Kokkos::deep_copy(result_initial_mj_gnos_, received_gnos);
8780 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>
8781 host_src_coordinates(
8782 Kokkos::ViewAllocateWithoutInitializing(
"mj_coordinates"),
8783 this->mj_coordinates.extent(0), this->mj_coordinates.extent(1));
8785 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
8787 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> dst_coordinates(
8788 Kokkos::ViewAllocateWithoutInitializing(
"mj_coordinates"),
8789 num_incoming_gnos, this->coord_dim);
8791 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_coord(
8792 Kokkos::ViewAllocateWithoutInitializing(
"received_coord"),
8795 for(
int i = 0; i < this->coord_dim; ++i) {
8797 auto sent_coord = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
8799 distributor.doPostsAndWaits(sent_coord, 1, received_coord);
8801 Kokkos::deep_copy(Kokkos::subview(dst_coordinates, Kokkos::ALL, i),
8805 result_mj_coordinates_ = dst_coordinates;
8809 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
8810 Kokkos::ViewAllocateWithoutInitializing(
"mj_weights"),
8811 num_incoming_gnos, this->num_weights_per_coord);
8812 auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
8814 auto host_src_weights = Kokkos::create_mirror_view_and_copy(
8815 Kokkos::HostSpace(), this->mj_weights);
8818 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_weight(
8819 Kokkos::ViewAllocateWithoutInitializing(
"send_weight_buffer"),
8820 this->num_local_coords);
8822 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_weight(
8823 Kokkos::ViewAllocateWithoutInitializing(
"received_weight_buffer"),
8826 for(
int i = 0; i < this->num_weights_per_coord; ++i) {
8828 auto sub_host_src_weights
8829 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
8830 auto sub_host_dst_weights
8831 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
8839 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
8840 sent_weight[n] = sub_host_src_weights(n);
8843 distributor.doPostsAndWaits(sent_weight, 1, received_weight);
8846 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
8847 sub_host_dst_weights(n) = received_weight[n];
8850 Kokkos::deep_copy(dst_weights, host_dst_weights);
8851 result_mj_weights_ = dst_weights;
8855 Kokkos::View<int*, Kokkos::HostSpace> sent_owners(
8856 Kokkos::ViewAllocateWithoutInitializing(
"sent_owners"),
8858 Kokkos::deep_copy(sent_owners, myRank);
8860 Kokkos::View<int*, Kokkos::HostSpace> received_owners(
8861 Kokkos::ViewAllocateWithoutInitializing(
"received_owners"),
8864 distributor.doPostsAndWaits(sent_owners, 1, received_owners);
8866 result_actual_owner_rank_ =
new int[num_incoming_gnos];
8868 result_actual_owner_rank_,
8869 received_owners.data(),
8870 num_incoming_gnos *
sizeof(int));
8874 timer_base_string +
"PreMigration DistributorMigration");
8875 return am_i_a_receiver;
8885 template <
typename Adapter>
8894 int execute_counter =
8896 timer_base_string =
"partition(" + std::to_string(execute_counter) +
") - ";
8898 this->mj_env->timerStart(
MACRO_TIMERS, timer_base_string +
"all");
8900 this->mj_env->timerStart(
MACRO_TIMERS, timer_base_string +
"setup");
8902 this->set_up_partitioning_data(solution);
8904 this->set_input_parameters(this->mj_env->getParameters());
8905 if(this->mj_keep_part_boxes) {
8906 this->mj_partitioner.set_to_keep_part_boxes();
8909 this->mj_partitioner.set_partitioning_parameters(
8910 this->distribute_points_on_cut_lines,
8911 this->max_concurrent_part_calculation,
8912 this->check_migrate_avoid_migration_option,
8913 this->minimum_migration_imbalance, this->migration_type);
8915 RCP<const Comm<int> > result_problemComm = this->mj_problemComm;
8916 mj_lno_t result_num_local_coords = this->num_local_coords;
8917 Kokkos::View<mj_gno_t*, device_t> result_initial_mj_gnos;
8919 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8920 result_mj_coordinates = this->mj_coordinates;
8921 Kokkos::View<mj_scalar_t**, device_t> result_mj_weights =
8923 int *result_actual_owner_rank = NULL;
8925 Kokkos::View<const mj_gno_t*, device_t> result_initial_mj_gnos_ =
8926 this->initial_mj_gnos;
8944 int current_world_size = this->mj_problemComm->getSize();
8945 mj_lno_t threshold_num_local_coords =
8946 this->min_coord_per_rank_for_premigration;
8947 bool is_pre_migrated =
false;
8948 bool am_i_in_subset =
true;
8954 if(mj_premigration_option > 0 &&
8955 size_t (current_world_size) > this->num_global_parts &&
8956 this->num_global_coords < mj_gno_t (
8957 current_world_size * threshold_num_local_coords))
8959 if(this->mj_keep_part_boxes) {
8960 throw std::logic_error(
"Multijagged: mj_keep_part_boxes and "
8961 "mj_premigration_option are not supported together yet.");
8964 is_pre_migrated =
true;
8965 int migration_selection_option = mj_premigration_option;
8966 if(migration_selection_option * this->num_global_parts >
8967 (
size_t) (current_world_size)) {
8968 migration_selection_option =
8969 current_world_size / this->num_global_parts;
8972 int used_num_ranks = int (this->num_global_coords /
8973 float (threshold_num_local_coords) + 0.5);
8975 if(used_num_ranks == 0) {
8979 am_i_in_subset = this->mj_premigrate_to_subset(
8981 migration_selection_option,
8983 this->mj_problemComm,
8985 this->num_local_coords,
8986 this->num_global_coords,
8987 this->num_global_parts,
8988 this->initial_mj_gnos,
8989 this->mj_coordinates,
8990 this->num_weights_per_coord,
8994 result_num_local_coords,
8995 result_initial_mj_gnos,
8996 result_mj_coordinates,
8998 result_actual_owner_rank);
9000 result_initial_mj_gnos_ = result_initial_mj_gnos;
9003 Kokkos::View<mj_part_t *, device_t> result_assigned_part_ids;
9004 Kokkos::View<mj_gno_t*, device_t> result_mj_gnos;
9006 this->mj_env->timerStop(
MACRO_TIMERS, timer_base_string +
"setup");
9008 if(am_i_in_subset) {
9009 this->mj_partitioner.multi_jagged_part(
9012 this->imbalance_tolerance,
9014 this->num_global_parts,
9015 this->part_no_array,
9016 this->recursion_depth,
9018 result_num_local_coords,
9019 this->num_global_coords,
9020 result_initial_mj_gnos_,
9021 result_mj_coordinates,
9022 this->num_weights_per_coord,
9023 this->mj_uniform_weights,
9025 this->mj_uniform_parts,
9026 result_assigned_part_ids,
9031 this->mj_env->timerStart(
MACRO_TIMERS, timer_base_string +
"cleanup");
9034 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid;
9035 localGidToLid.reserve(result_num_local_coords);
9036 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> host_result_initial_mj_gnos(
9037 Kokkos::ViewAllocateWithoutInitializing(
"host_result_initial_mj_gnos"),
9038 result_initial_mj_gnos_.size());
9039 Kokkos::deep_copy(host_result_initial_mj_gnos, result_initial_mj_gnos_);
9040 for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9041 localGidToLid[host_result_initial_mj_gnos(i)] = i;
9044 ArrayRCP<mj_part_t> partId = arcp(
new mj_part_t[result_num_local_coords],
9045 0, result_num_local_coords,
true);
9046 auto host_result_assigned_part_ids =
9047 Kokkos::create_mirror_view(result_assigned_part_ids);
9048 Kokkos::deep_copy(host_result_assigned_part_ids, result_assigned_part_ids);
9049 auto host_result_mj_gnos = Kokkos::create_mirror_view(result_mj_gnos);
9050 Kokkos::deep_copy(host_result_mj_gnos, result_mj_gnos);
9051 for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9052 mj_lno_t origLID = localGidToLid[host_result_mj_gnos(i)];
9053 partId[origLID] = host_result_assigned_part_ids(i);
9058 if(is_pre_migrated) {
9059 this->mj_env->timerStart(
MACRO_TIMERS, timer_base_string +
9060 "PostMigration DistributorPlanCreating");
9061 Tpetra::Distributor distributor(this->mj_problemComm);
9063 ArrayView<const mj_part_t> actual_owner_destinations(
9064 result_actual_owner_rank , result_num_local_coords);
9066 mj_lno_t num_incoming_gnos = distributor.createFromSends(
9067 actual_owner_destinations);
9069 if(num_incoming_gnos != this->num_local_coords) {
9070 throw std::logic_error(
"Zoltan2 - Multijagged Post Migration - "
9071 "num incoming is not equal to num local coords");
9075 "PostMigration DistributorPlanCreating");
9077 "PostMigration DistributorMigration");
9079 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
9080 Kokkos::ViewAllocateWithoutInitializing(
"received_gnos"),
9082 Kokkos::View<mj_part_t*, Kokkos::HostSpace> received_partids(
9083 Kokkos::ViewAllocateWithoutInitializing(
"received_partids"),
9086 distributor.doPostsAndWaits(host_result_initial_mj_gnos, 1,
9089 Kokkos::View<mj_part_t*, Kokkos::HostSpace> sent_partnos;
9090 if (partId.size() > 0) {
9091 sent_partnos = Kokkos::View<mj_part_t*, Kokkos::HostSpace>(
9092 partId.getRawPtr(), partId.size());
9094 distributor.doPostsAndWaits(sent_partnos, 1, received_partids);
9097 partId = arcp(
new mj_part_t[this->num_local_coords],
9098 0, this->num_local_coords,
true);
9101 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid2;
9102 localGidToLid2.reserve(this->num_local_coords);
9103 auto host_initial_mj_gnos =
9104 Kokkos::create_mirror_view(this->initial_mj_gnos);
9105 Kokkos::deep_copy(host_initial_mj_gnos,
9106 this->initial_mj_gnos);
9107 for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9108 localGidToLid2[host_initial_mj_gnos(i)] = i;
9111 for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9112 mj_lno_t origLID = localGidToLid2[received_gnos[i]];
9113 partId[origLID] = received_partids[i];
9118 delete [] result_actual_owner_rank;
9121 timer_base_string +
"PostMigration DistributorMigration");
9123 solution->setParts(partId);
9124 this->mj_env->timerStop(
MACRO_TIMERS, timer_base_string +
"cleanup");
9127 this->mj_env->timerStop(
MACRO_TIMERS, timer_base_string +
"all");
9130 this->mj_coordinates = Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>();
9135 template <
typename Adapter>
9148 int criteria_dim = (this->num_weights_per_coord ?
9149 this->num_weights_per_coord : 1);
9153 this->num_global_parts = solution->getTargetGlobalNumberOfParts();
9156 this->mj_uniform_parts = Kokkos::View<bool *, Kokkos::HostSpace>(
9157 "uniform parts", criteria_dim);
9158 this->mj_uniform_weights = Kokkos::View<bool *, Kokkos::HostSpace>(
9159 "uniform weights", criteria_dim);
9161 Kokkos::View<const mj_gno_t *, device_t> gnos;
9162 Kokkos::View<adapter_scalar_t **, Kokkos::LayoutLeft, device_t> xyz_adapter;
9164 Kokkos::View<adapter_scalar_t **, device_t> wgts_adapter;
9167 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> xyz;
9168 Kokkos::View<mj_scalar_t **, device_t> wgts;
9172 if(std::is_same<mj_scalar_t, adapter_scalar_t>()) {
9175 assign_if_same(xyz, xyz_adapter);
9176 assign_if_same(wgts, wgts_adapter);
9181 xyz = Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
9182 (Kokkos::ViewAllocateWithoutInitializing(
9183 "xyz"), xyz_adapter.extent(0), xyz_adapter.extent(1));
9184 wgts = Kokkos::View<mj_scalar_t **, device_t>(
9185 Kokkos::ViewAllocateWithoutInitializing(
"wgts"),
9186 wgts_adapter.extent(0), wgts_adapter.extent(1));
9188 typedef typename Kokkos::View<mj_scalar_t **, device_t>::size_type view_size_t;
9189 Kokkos::parallel_for(
9190 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9191 (0, xyz_adapter.extent(0)), KOKKOS_LAMBDA (
int i) {
9192 for(view_size_t n = 0; n < xyz_adapter.extent(1); ++n) {
9193 xyz(i, n) =
static_cast<mj_scalar_t
>(xyz_adapter(i, n));
9196 Kokkos::parallel_for(
9197 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9198 (0, wgts.extent(0)), KOKKOS_LAMBDA (
int i) {
9199 for(view_size_t n = 0; n < wgts.extent(1); ++n) {
9200 wgts(i, n) =
static_cast<mj_scalar_t
>(wgts_adapter(i, n));
9206 this->initial_mj_gnos = gnos;
9208 this->mj_coordinates = xyz;
9211 if(this->num_weights_per_coord == 0) {
9212 this->mj_uniform_weights(0) =
true;
9213 Kokkos::resize(this->mj_weights, 0, 0);
9216 this->mj_weights = wgts;
9217 for(
int wdim = 0; wdim < this->num_weights_per_coord; ++wdim) {
9218 this->mj_uniform_weights(wdim) =
false;
9222 for(
int wdim = 0; wdim < criteria_dim; ++wdim) {
9223 if(solution->criteriaHasUniformPartSizes(wdim)) {
9224 this->mj_uniform_parts(wdim) =
true;
9227 printf(
"Error: MJ does not support non uniform target part weights\n");
9236 template <
typename Adapter>
9238 const Teuchos::ParameterList &pl)
9240 const Teuchos::ParameterEntry *pe = pl.getEntryPtr(
"imbalance_tolerance");
9243 tol = pe->getValue(&tol);
9244 this->imbalance_tolerance = tol - 1.0;
9248 if(this->imbalance_tolerance <= 0) {
9249 this->imbalance_tolerance= 10e-4;
9253 Kokkos::resize(this->part_no_array, 0);
9256 this->recursion_depth = 0;
9258 if(pl.getPtr<
int>(
"mj_num_teams")) {
9259 this->num_teams = pl.get<
int>(
"mj_num_teams");
9262 if(pl.getPtr<Array <mj_part_t> >(
"mj_parts")) {
9263 auto mj_parts = pl.get<Array <mj_part_t> >(
"mj_parts");
9264 int mj_parts_size =
static_cast<int>(mj_parts.size());
9267 this->part_no_array = Kokkos::View<mj_part_t*, Kokkos::HostSpace>(
9268 "part_no_array", mj_parts_size);
9269 for(
int i = 0; i < mj_parts_size; ++i) {
9270 this->part_no_array(i) = mj_parts.getRawPtr()[i];
9273 this->recursion_depth = mj_parts_size - 1;
9274 this->mj_env->debug(2,
"mj_parts provided by user");
9278 this->distribute_points_on_cut_lines =
true;
9279 this->max_concurrent_part_calculation = 1;
9281 this->mj_run_as_rcb =
false;
9282 this->mj_premigration_option = 0;
9283 this->min_coord_per_rank_for_premigration = 32000;
9285 int mj_user_recursion_depth = -1;
9286 this->mj_keep_part_boxes =
false;
9287 this->check_migrate_avoid_migration_option = 0;
9288 this->migration_type = 0;
9289 this->minimum_migration_imbalance = 0.35;
9291 pe = pl.getEntryPtr(
"mj_minimum_migration_imbalance");
9294 imb = pe->getValue(&imb);
9295 this->minimum_migration_imbalance = imb - 1.0;
9298 pe = pl.getEntryPtr(
"mj_migration_option");
9300 this->check_migrate_avoid_migration_option =
9301 pe->getValue(&this->check_migrate_avoid_migration_option);
9303 this->check_migrate_avoid_migration_option = 0;
9305 if(this->check_migrate_avoid_migration_option > 1) {
9306 this->check_migrate_avoid_migration_option = -1;
9310 pe = pl.getEntryPtr(
"mj_migration_type");
9312 this->migration_type = pe->getValue(&this->migration_type);
9314 this->migration_type = 0;
9320 pe = pl.getEntryPtr(
"mj_concurrent_part_count");
9322 this->max_concurrent_part_calculation =
9323 pe->getValue(&this->max_concurrent_part_calculation);
9325 this->max_concurrent_part_calculation = 1;
9328 pe = pl.getEntryPtr(
"mj_keep_part_boxes");
9330 this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
9332 this->mj_keep_part_boxes =
false;
9343 if(this->mj_keep_part_boxes ==
false) {
9344 pe = pl.getEntryPtr(
"mapping_type");
9346 int mapping_type = -1;
9347 mapping_type = pe->getValue(&mapping_type);
9348 if(mapping_type == 0) {
9349 mj_keep_part_boxes =
true;
9355 pe = pl.getEntryPtr(
"mj_enable_rcb");
9357 this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
9359 this->mj_run_as_rcb =
false;
9362 pe = pl.getEntryPtr(
"mj_premigration_option");
9364 mj_premigration_option = pe->getValue(&mj_premigration_option);
9366 mj_premigration_option = 0;
9369 pe = pl.getEntryPtr(
"mj_premigration_coordinate_count");
9371 min_coord_per_rank_for_premigration = pe->getValue(&mj_premigration_option);
9373 min_coord_per_rank_for_premigration = 32000;
9376 pe = pl.getEntryPtr(
"mj_recursion_depth");
9378 mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
9380 mj_user_recursion_depth = -1;
9384 pe = pl.getEntryPtr(
"rectilinear");
9386 val = pe->getValue(&val);
9389 this->distribute_points_on_cut_lines =
false;
9391 this->distribute_points_on_cut_lines =
true;
9394 if(this->mj_run_as_rcb) {
9395 mj_user_recursion_depth =
9396 (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
9398 if(this->recursion_depth < 1) {
9399 if(mj_user_recursion_depth > 0) {
9400 this->recursion_depth = mj_user_recursion_depth;
9403 this->recursion_depth = this->coord_dim;
9409 template <
typename Adapter>
9412 adapter_scalar_t *lower,
9413 adapter_scalar_t *upper,
9414 size_t &nPartsFound,
9424 if(this->mj_keep_part_boxes) {
9427 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9429 size_t nBoxes = (*partBoxes).size();
9431 throw std::logic_error(
"no part boxes exist");
9435 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9437 if(globalBox->boxesOverlap(dim, lower, upper)) {
9439 std::vector<typename Adapter::part_t> partlist;
9442 for(
size_t i = 0; i < nBoxes; i++) {
9444 if((*partBoxes)[i].boxesOverlap(dim, lower, upper)) {
9446 partlist.push_back((*partBoxes)[i].getpId());
9468 *partsFound =
new mj_part_t[nPartsFound];
9469 for(
size_t i = 0; i < nPartsFound; i++)
9470 (*partsFound)[i] = partlist[i];
9482 throw std::logic_error(
"need to use keep_cuts parameter for boxAssign");
9487 template <
typename Adapter>
9490 adapter_scalar_t *point)
const
9496 if(this->mj_keep_part_boxes) {
9500 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9502 size_t nBoxes = (*partBoxes).size();
9504 throw std::logic_error(
"no part boxes exist");
9508 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9510 if(globalBox->pointInBox(dim, point)) {
9514 for(i = 0; i < nBoxes; i++) {
9516 if((*partBoxes)[i].pointInBox(dim, point)) {
9517 foundPart = (*partBoxes)[i].getpId();
9531 std::ostringstream oss;
9533 for(
int j = 0; j < dim; j++) oss << point[j] <<
" ";
9534 oss <<
") not found in domain";
9535 throw std::logic_error(oss.str());
9545 size_t closestBox = 0;
9546 coord_t minDistance = std::numeric_limits<coord_t>::max();
9547 coord_t *centroid =
new coord_t[dim];
9548 for(
size_t i = 0; i < nBoxes; i++) {
9549 (*partBoxes)[i].computeCentroid(centroid);
9552 for(
int j = 0; j < dim; j++) {
9553 diff = centroid[j] - point[j];
9556 if(sum < minDistance) {
9561 foundPart = (*partBoxes)[closestBox].getpId();
9568 throw std::logic_error(
"need to use keep_cuts parameter for pointAssign");
9572 template <
typename Adapter>
9578 if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL) {
9579 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
9580 mj_part_t ntasks = (*pBoxes).size();
9581 int dim = (*pBoxes)[0].getDim();
9582 GridHash grid(pBoxes, ntasks, dim);
9589 template <
typename Adapter>
9590 RCP<typename Zoltan2_AlgMJ<Adapter>::mj_partBoxVector_t>
9593 return this->mj_partitioner.get_kept_boxes();
Zoltan2_MJArrayType< scalar_t > value_type
KOKKOS_INLINE_FUNCTION Zoltan2_MJArrayType()
KOKKOS_INLINE_FUNCTION void init(value_type dst) const
Kokkos::View< index_t *, device_t > part_xadj
GridHash Class, Hashing Class for part boxes.
void set_up_partitioning_data(const RCP< PartitioningSolution< Adapter > > &solution)
Time an algorithm (or other entity) as a whole.
global_size_t getGlobalNumCoordinates() const
Returns the global number coordinates.
void set(IT index_, CT count_, WT *vals_)
KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const
int value_count_rightleft
#define Z2_FORWARD_EXCEPTIONS
Forward an exception back through call stack.
KOKKOS_INLINE_FUNCTION void operator()(const member_type &teamMember, value_type teamSum) const
Kokkos::View< index_t *, device_t > track_on_cuts
static int get_counter_AlgMJ()
Defines Parameter related enumerators, declares functions.
KOKKOS_INLINE_FUNCTION ArrayReducer(value_type &val, int mj_value_count)
static RCP< Teuchos::BoolParameterEntryValidator > getBoolValidator()
Exists to make setting up validators less cluttered.
KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
void getAdjArrays(ArrayRCP< part_t > &comXAdj_, ArrayRCP< part_t > &comAdj_)
GridHash Class, returns the adj arrays.
Zoltan2_AlgMJ(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, const RCP< const typename Adapter::base_adapter_t > &adapter)
Zoltan2_BoxBoundaries(Ordinal s_)
Constructor.
Kokkos::View< scalar_t *, device_t > coordinates
Sort items for quick sort function.
std::bitset< NUM_MODEL_FLAGS > modelFlag_t
void uqSignsort(IT n, uSignedSortItem< IT, WT, SIGN > *arr)
Quick sort function. Sorts the arr of uSignedSortItems, with respect to increasing vals...
Kokkos::View< index_t *, device_t > permutations
map_t::global_ordinal_type gno_t
Class for sorting items with multiple values. First sorting with respect to val[0], then val[1] then ... val[count-1]. The last tie breaking is done with index values. Used for task mapping partitioning where the points on a cut line needs to be distributed consistently.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyDoubleValidator()
Exists to make setting up validators less cluttered.
Kokkos::View< scalar_t **, device_t > weights
KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
ArrayCombinationReducer reducer
void partition(const RCP< PartitioningSolution< Adapter > > &solution)
Multi Jagged coordinate partitioning algorithm.
void reduce(const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
Implement Teuchos::ValueTypeReductionOp interface.
KOKKOS_INLINE_FUNCTION value_type & reference() const
Kokkos::View< scalar_t * > scalar_view_t
coordinateModelPartBox Class, represents the boundaries of the box which is a result of a geometric p...
KOKKOS_INLINE_FUNCTION void init(value_type &dst) const
void set_to_keep_part_boxes()
Function call, if the part boxes are intended to be kept.
A ParameterList validator for integer range lists.
static int get_counter_Zoltan2_AlgMJ()
void getCommunicationGraph(const PartitioningSolution< Adapter > *solution, ArrayRCP< mj_part_t > &comXAdj, ArrayRCP< mj_part_t > &comAdj)
returns communication graph resulting from MJ partitioning.
SparseMatrixAdapter_t::part_t part_t
Multi Jagged coordinate partitioning algorithm.
This class provides geometric coordinates with optional weights to the Zoltan2 algorithm.
policy_t::member_type member_type
void boxAssign(int dim, adapter_scalar_t *lower, adapter_scalar_t *upper, size_t &nPartsFound, mj_part_t **partsFound) const
Kokkos::View< scalar_t *, device_t > cut_coordinates
A PartitioningSolution is a solution to a partitioning problem.
Zoltan2_BoxBoundaries()
Default Constructor.
Kokkos::View< index_t *, device_t > permutations
void set_partitioning_parameters(bool distribute_points_on_cut_lines_, int max_concurrent_part_calculation_, int check_migrate_avoid_migration_option_, double minimum_migration_imbalance_, int migration_type_=0)
Multi Jagged coordinate partitioning algorithm.
KOKKOS_INLINE_FUNCTION value_type & reference() const
size_t getCoordinatesKokkos(Kokkos::View< const gno_t *, typename node_t::device_type > &Ids, Kokkos::View< scalar_t **, Kokkos::LayoutLeft, typename node_t::device_type > &xyz, Kokkos::View< scalar_t **, typename node_t::device_type > &wgts) const
Returns the coordinate ids, values and optional weights.
AlgMJ()
Multi Jagged coordinate partitioning algorithm default constructor.
Kokkos::View< part_t *, device_t > parts
Zoltan2_BoxBoundaries is a reduction operation to all reduce the all box boundaries.
ReduceWeightsFunctor(int mj_loop_count, array_t mj_max_scalar, part_t mj_concurrent_current_part, part_t mj_num_cuts, part_t mj_current_work_part, part_t mj_current_concurrent_num_parts, part_t mj_left_right_array_size, part_t mj_weight_array_size, Kokkos::View< index_t *, device_t > &mj_permutations, Kokkos::View< scalar_t *, device_t > &mj_coordinates, Kokkos::View< scalar_t **, device_t > &mj_weights, Kokkos::View< part_t *, device_t > &mj_parts, Kokkos::View< scalar_t *, device_t > &mj_cut_coordinates, Kokkos::View< index_t *, device_t > &mj_part_xadj, bool mj_uniform_weights0, scalar_t mj_sEpsilon)
Algorithm defines the base class for all algorithms.
map_t::local_ordinal_type lno_t
mj_partBoxVector_t & getPartBoxesView() const
for partitioning methods, return bounding boxes of the
#define Z2_ASSERT_VALUE(actual, expected)
Throw an error when actual value is not equal to expected value.
RCP< mj_partBoxVector_t > get_kept_boxes() const
DOCWORK: Documentation.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyIntValidator()
Exists to make setting up validators less cluttered.
part_t concurrent_current_part
Kokkos::View< part_t *, device_t > parts
RCP< mj_partBoxVector_t > compute_global_box_boundaries(RCP< mj_partBoxVector_t > &localPartBoxes) const
DOCWORK: Documentation.
uMultiSortItem(IT index_, CT count_, WT *vals_)
KOKKOS_INLINE_FUNCTION ArrayCombinationReducer(scalar_t mj_max_scalar, value_type &val, int mj_value_count_rightleft, int mj_value_count_weights)
KOKKOS_INLINE_FUNCTION void init(value_type dst) const
size_t getLocalNumCoordinates() const
Returns the number of coordinates on this process.
int getNumWeightsPerCoordinate() const
Returns the number (0 or greater) of weights per coordinate.
Define IntegerRangeList validator.
size_t team_shmem_size(int team_size) const
Defines the CoordinateModel classes.
void multi_jagged_part(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, double imbalance_tolerance, int num_teams, size_t num_global_parts, Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, int recursion_depth, int coord_dim, mj_lno_t num_local_coords, mj_gno_t num_global_coords, Kokkos::View< const mj_gno_t *, device_t > &initial_mj_gnos, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates, int num_weights_per_coord, Kokkos::View< bool *, Kokkos::HostSpace > &mj_uniform_weights, Kokkos::View< mj_scalar_t **, device_t > &mj_weights, Kokkos::View< bool *, Kokkos::HostSpace > &mj_uniform_parts, Kokkos::View< mj_part_t *, device_t > &result_assigned_part_ids, Kokkos::View< mj_gno_t *, device_t > &result_mj_gnos)
Multi Jagged coordinate partitioning algorithm.
part_t concurrent_current_part
KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const
Kokkos::View< scalar_t * > scalar_view_t
policy_t::member_type member_type
Tpetra::global_size_t global_size_t
Zoltan2_MJArrayType< scalar_t > value_type
part_t current_concurrent_num_parts
mj_part_t pointAssign(int dim, adapter_scalar_t *point) const
KOKKOS_INLINE_FUNCTION void init(value_type &dst) const
#define Z2_THROW_OUTSIDE_ERROR(env)
Throw an error returned from outside the Zoltan2 library.
int value_count_rightleft
int getCoordinateDim() const
Returns the dimension of the coordinates.
Kokkos::View< scalar_t *, device_t > coordinates
ReduceArrayFunctor(part_t mj_concurrent_current_part, part_t mj_weight_array_size, Kokkos::View< index_t *, device_t > &mj_permutations, Kokkos::View< scalar_t *, device_t > &mj_coordinates, Kokkos::View< part_t *, device_t > &mj_parts, Kokkos::View< index_t *, device_t > &mj_part_xadj, Kokkos::View< index_t *, device_t > &mj_track_on_cuts)
Kokkos::View< index_t *, device_t > part_xadj
KOKKOS_INLINE_FUNCTION Zoltan2_MJArrayType(scalar_t *pSetPtr)
A gathering of useful namespace methods.
void uqsort(IT n, uSortItem< IT, WT > *arr)
Quick sort function. Sorts the arr of uSortItems, with respect to increasing vals. DOCWORK: Document input params.
Contains Teuchos redcution operators for the Multi-jagged algorthm.
RCP< mj_partBox_t > get_global_box() const
DOCWORK: Documentation.
KOKKOS_INLINE_FUNCTION void join(volatile value_type &dst, const volatile value_type &src) const
size_t team_shmem_size(int team_size) const
Multi Jagged coordinate partitioning algorithm.
static void getValidParameters(ParameterList &pl)
Set up validators specific to this algorithm.
void sequential_task_partitioning(const RCP< const Environment > &env, mj_lno_t num_total_coords, mj_lno_t num_selected_coords, size_t num_target_part, int coord_dim, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates_, Kokkos::View< mj_lno_t *, device_t > &initial_selected_coords_output_permutation, mj_lno_t *output_xadj, int recursion_depth_, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, bool partition_along_longest_dim, int num_ranks_per_node, bool divide_to_prime_first_, mj_part_t num_first_level_parts_=1, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &first_level_distribution_=Kokkos::View< mj_part_t *, Kokkos::HostSpace >())
Special function for partitioning for task mapping. Runs sequential, and performs deterministic parti...