Zoltan2_AlgMultiJagged.hpp
// @HEADER
// *****************************************************************************
// Zoltan2: A package of combinatorial algorithms for scientific computing
//
// Copyright 2012 NTESS and the Zoltan2 contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef _ZOLTAN2_ALGMultiJagged_HPP_
#define _ZOLTAN2_ALGMultiJagged_HPP_

#include <Zoltan2_Parameters.hpp>
#include <Zoltan2_Algorithm.hpp>
#include <Zoltan2_Util.hpp>
#include <Tpetra_Distributor.hpp>
#include <Teuchos_StandardParameterEntryValidators.hpp>
#include <Teuchos_ParameterList.hpp>
#include <Kokkos_Sort.hpp>

#include <algorithm> // std::sort
#include <vector>
#include <unordered_map>

#ifdef ZOLTAN2_USEZOLTANCOMM
#ifdef HAVE_ZOLTAN2_MPI
#define ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
#include "zoltan_comm_cpp.h"
#include "zoltan_types.h" // for error codes
#endif
#endif

namespace Teuchos{

template <typename Ordinal, typename T>
class Zoltan2_BoxBoundaries : public ValueTypeReductionOp<Ordinal,T>
{
private:
  Ordinal size;
  T epsilon;

public:
  Zoltan2_BoxBoundaries() : size(0),
    epsilon(std::numeric_limits<T>::epsilon()) {}

  Zoltan2_BoxBoundaries(Ordinal s_):
    size(s_), epsilon(std::numeric_limits<T>::epsilon()) {}

  void reduce(const Ordinal count, const T inBuffer[], T inoutBuffer[]) const {
    for(Ordinal i = 0; i < count; i++) {
      if(Z2_ABS(inBuffer[i]) > epsilon) {
        inoutBuffer[i] = inBuffer[i];
      }
    }
  }
};
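
// Illustrative sketch (not part of the original header): this reduction op is
// meant to be passed to Teuchos::reduceAll() so that nonzero local box
// boundaries survive the global reduction, while entries below epsilon leave
// the result untouched. A hypothetical call, with made-up buffer names:
//
//   Zoltan2_BoxBoundaries<int, double> boxOp(2 * dim);
//   Teuchos::reduceAll<int, double>(*comm, boxOp, 2 * dim,
//                                   localBoundaries, globalBoundaries);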

} // namespace Teuchos

namespace Zoltan2{

template <typename IT, typename CT, typename WT>
class uMultiSortItem
{
public:
  // TODO: Why volatile?
  // no idea, another intel compiler failure.
  volatile IT index;
  volatile CT count;
  volatile WT *val;
  volatile WT epsilon;

  uMultiSortItem() {
    this->index = 0;
    this->count = 0;
    this->val = NULL;
    this->epsilon = std::numeric_limits<WT>::epsilon() * 100;
  }

  // TODO: Document these methods?
  uMultiSortItem(IT index_, CT count_, WT *vals_) {
    this->index = index_;
    this->count = count_;
    this->val = vals_;
    this->epsilon = std::numeric_limits<WT>::epsilon() * 100;
  }

  ~uMultiSortItem() {
  }

  void set(IT index_, CT count_, WT *vals_) {
    this->index = index_;
    this->count = count_;
    this->val = vals_;
  }

  bool operator<(const uMultiSortItem<IT,CT,WT>& other) const {
    assert(this->count == other.count);
    for(CT i = 0; i < this->count; ++i) {
      // if the values are equal go to next one.
      if(std::abs(this->val[i] - other.val[i]) < this->epsilon) {
        continue;
      }
      // if next value is smaller return true;
      if(this->val[i] < other.val[i]) {
        return true;
      }
      // if next value is bigger return false;
      else {
        return false;
      }
    }
    // if they are totally equal.
    return this->index < other.index;
  }
};
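
// Illustrative sketch (not part of the original header): uMultiSortItem holds
// a pointer to `count` comparison keys, so a lexicographic multi-key sort
// reduces to std::sort over the operator< above. This hypothetical helper is
// only a usage demonstration.
inline bool example_multi_key_sort()
{
  double keys0[2] = {1.0, 2.0};
  double keys1[2] = {1.0, 1.0};
  std::vector<uMultiSortItem<int, int, double> > items;
  items.push_back(uMultiSortItem<int, int, double>(0, 2, keys0));
  items.push_back(uMultiSortItem<int, int, double>(1, 2, keys1));
  std::sort(items.begin(), items.end());
  // keys1 compares lexicographically smaller, so index 1 sorts first.
  return items[0].index == 1;
}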

template <class IT, class WT>
struct uSortItem
{
  IT id;
  WT val;
};

template <class IT, class WT>
void uqsort(IT n, uSortItem<IT, WT> * arr) {
  const int NSTACK = 50;
  int M = 7;
  IT i, ir=n, j, k, l=1;
  IT jstack=0, istack[NSTACK];
  WT aval;
  uSortItem<IT,WT> a;

  --arr;
  for(;;) {
    if(ir-l < M) {
      for(j=l+1;j<=ir;j++) {
        a=arr[j];
        aval = a.val;
        for(i=j-1;i>=1;i--) {
          if(arr[i].val <= aval)
            break;
          arr[i+1] = arr[i];
        }
        arr[i+1]=a;
      }
      if(jstack == 0)
        break;
      ir=istack[jstack--];
      l=istack[jstack--];
    }
    else {
      k=(l+ir) >> 1;
      std::swap(arr[k],arr[l+1]);
      if(arr[l+1].val > arr[ir].val) {
        std::swap(arr[l+1],arr[ir]);
      }
      if(arr[l].val > arr[ir].val) {
        std::swap(arr[l],arr[ir]);
      }
      if(arr[l+1].val > arr[l].val) {
        std::swap(arr[l+1],arr[l]);
      }
      i=l+1;
      j=ir;
      a=arr[l];
      aval = a.val;
      for(;;) {
        do i++; while (arr[i].val < aval);
        do j--; while (arr[j].val > aval);
        if(j < i) break;
        std::swap(arr[i],arr[j]);
      }
      arr[l]=arr[j];
      arr[j]=a;
      jstack += 2;
      if(jstack > NSTACK) {
        std::cout << "uqsort: NSTACK too small in sort." << std::endl;
        std::terminate();
      }
      if(ir-i+1 >= j-l) {
        istack[jstack]=ir;
        istack[jstack-1]=i;
        ir=j-1;
      }
      else {
        istack[jstack]=j-1;
        istack[jstack-1]=l;
        l=i;
      }
    }
  }
}
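
// Illustrative sketch (not part of the original header): uqsort orders an
// array of uSortItem by val in increasing order. This hypothetical helper
// only demonstrates the call.
inline bool example_uqsort_usage()
{
  uSortItem<int, double> items[3];
  items[0].id = 10; items[0].val = 3.0;
  items[1].id = 11; items[1].val = 1.0;
  items[2].id = 12; items[2].val = 2.0;
  uqsort<int, double>(3, items);
  // ids are now ordered 11, 12, 10 by increasing val.
  return items[0].id == 11 && items[2].id == 10;
}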

template <class IT, class WT, class SIGN>
struct uSignedSortItem
{
  IT id;
  WT val;
  SIGN signbit; // 1 means positive, 0 means negative.
  bool operator<(const uSignedSortItem<IT, WT, SIGN>& rhs) const {
    /*if I am negative, the other is positive*/
    if(this->signbit < rhs.signbit) {
      return true;
    }
    /*if both have the same sign*/
    else if(this->signbit == rhs.signbit) {
      if(this->val < rhs.val) { //if my value is smaller,
        return this->signbit;   //then if we both are positive return true.
                                //if we both are negative, return false.
      }
      else if(this->val > rhs.val) { //if my value is larger,
        return !this->signbit;       //then if we both are positive return false.
                                     //if we both are negative, return true.
      }
      else { //if both are equal.
        return false;
      }
    }
    else {
      /*if I am positive, the other is negative*/
      return false;
    }
  }

  bool operator<=(const uSignedSortItem<IT, WT, SIGN>& rhs) {
    return (this->val == rhs.val && this->signbit == rhs.signbit) || (*this < rhs);
  }
};

template <class IT, class WT, class SIGN>
void uqSignsort(IT n, uSignedSortItem<IT, WT, SIGN> * arr) {
  const IT NSTACK = 50;
  IT M = 7;
  IT i, ir=n, j, k, l=1;
  IT jstack=0, istack[NSTACK];
  uSignedSortItem<IT, WT, SIGN> a;

  --arr;
  for(;;) {
    if(ir < M + l) {
      for(j=l+1;j<=ir;j++) {
        a=arr[j];
        for(i=j-1;i>=1;i--) {
          if(arr[i] <= a) {
            break;
          }
          arr[i+1] = arr[i];
        }
        arr[i+1]=a;
      }
      if(jstack == 0) {
        break;
      }
      ir=istack[jstack--];
      l=istack[jstack--];
    }
    else {
      k=(l+ir) >> 1;
      std::swap(arr[k],arr[l+1]);
      if(arr[ir] < arr[l+1]) {
        std::swap(arr[l+1],arr[ir]);
      }
      if(arr[ir] < arr[l]) {
        std::swap(arr[l],arr[ir]);
      }
      if(arr[l] < arr[l+1]) {
        std::swap(arr[l+1],arr[l]);
      }
      i=l+1;
      j=ir;
      a=arr[l];
      for(;;) {
        do i++; while (arr[i] < a);
        do j--; while (a < arr[j]);
        if(j < i) break;
        std::swap(arr[i],arr[j]);
      }
      arr[l]=arr[j];
      arr[j]=a;
      jstack += 2;
      if(jstack > NSTACK) {
        std::cout << "uqSignsort: NSTACK too small in sort." << std::endl;
        std::terminate();
      }
      if(ir+l+1 >= j+i) {
        istack[jstack]=ir;
        istack[jstack-1]=i;
        ir=j-1;
      }
      else {
        istack[jstack]=j-1;
        istack[jstack-1]=l;
        l=i;
      }
    }
  }
}
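
// Illustrative sketch (not part of the original header): uqSignsort orders
// items with signbit == 0 (negative) before items with signbit == 1
// (positive), and by val within a sign class. MJ uses it below to rank
// coordinate dimensions by their spans. This hypothetical helper only
// demonstrates the call.
inline bool example_uqSignsort_usage()
{
  uSignedSortItem<int, double, char> dims[2];
  dims[0].id = 0; dims[0].val = 9.0; dims[0].signbit = 1;
  dims[1].id = 1; dims[1].val = 5.0; dims[1].signbit = 1;
  uqSignsort<int, double, char>(2, dims);
  // the largest span sorts last: dims[1] now has id 0.
  return dims[1].id == 0;
}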

// This exists only so we can track how many times the MJ algorithm is
// called and put each of those into different timer names.
// Currently the MultiJaggedTest.cpp will actually call it twice.
// First time with data from a Tpetra MultiVector and then a second time using
// a BasicVectorAdapter which allows us to turn UVM off for some tests. The
// results of the two runs are compared which helps to catch a lot of bugs. For
// profiling I'm mostly just interested in the UVM off case and need it to be
// in separate timers. Passing a value through would mess up the API. Possibly
// we could check the Adapter and use that. The statics have to be outside the
// templated class as the two called instances will be different template
// parameters. Another complication is that MultiJagged.cpp will call through
// the Zoltan2_AlgMJ class and we want to time things in both classes. However
// TaskMapper will directly call AlgMJ so I made two counters for the two
// classes to make sure it was always correct. This does not impact any
// behavior and has the sole purpose of generating unique timer names. If you
// run an MJ test you'll see MJ(0) and MJ(1) in the names to distinguish the
// 1st and 2nd run. Right now only MultiJaggedTest.cpp cares about this.
struct Zoltan2_AlgMJ_TrackCallsCounter {
  static int get_counter_AlgMJ() {
    static int counter = 0;
    return counter++;
  }
  static int get_counter_Zoltan2_AlgMJ() {
    static int counter = 0;
    return counter++;
  }
};
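
// Illustrative sketch (not part of the original header): the counters are
// consumed when building the timer base names, along these lines (names are
// hypothetical):
inline std::string example_mj_timer_base_name()
{
  int invocation = Zoltan2_AlgMJ_TrackCallsCounter::get_counter_AlgMJ();
  // yields "MJ(0) - " for the first run, "MJ(1) - " for the second, ...
  return "MJ(" + std::to_string(invocation) + ") - ";
}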

template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t, typename mj_node_t>
class AlgMJ
{
private:
  typedef typename mj_node_t::device_type device_t; // for views
  typedef coordinateModelPartBox mj_partBox_t;
  typedef std::vector<mj_partBox_t> mj_partBoxVector_t;

  // if the (last dimension reduce all count) x the mpi world size is
  // estimated to be bigger than this number then migration will be forced
  // in earlier iterations.
  static constexpr size_t future_reduceall_cutoff = 1500000;

  // if parts right before the last dimension are estimated to have fewer than
  // min_work_last_dim coords, migration will be forced in earlier iterations.
  static constexpr mj_lno_t min_work_last_dim = 1000;

  static constexpr mj_scalar_t least_signifiance = 0.0001;
  static constexpr int significance_mul = 1000;

  std::string mj_timer_base_string; // for convenience making timer names

  RCP<const Environment> mj_env; // the environment object
  RCP<const Comm<int> > mj_problemComm; // initial comm object
  RCP<Comm<int> > comm; // comm object that can be altered during execution
  double imbalance_tolerance; // input imbalance tolerance.
  int recursion_depth; // number of steps that partitioning will be solved in.
  int coord_dim; // coordinate dim
  int num_weights_per_coord; // # of weights per coord
  size_t initial_num_loc_coords; // initial num local coords.
  global_size_t initial_num_glob_coords; // initial num global coords.
  mj_lno_t num_local_coords; // number of local coords.
  mj_gno_t num_global_coords; // number of global coords.
  mj_scalar_t sEpsilon; // epsilon for mj_scalar_t

  // can distribute points on the same coordinate to different parts.
  bool distribute_points_on_cut_lines;

  // how many parts we can calculate concurrently.
  mj_part_t max_concurrent_part_calculation;

  bool mj_run_as_rcb; // means recursion depth is adjusted to maximum value.
  int mj_user_recursion_depth; // the recursion depth value provided by user.
  bool mj_keep_part_boxes; // if the boxes need to be kept.

  // whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
  int check_migrate_avoid_migration_option;

  // when doing the migration, 0 will aim for perfect load balance, 1 will
  // aim for a minimized number of messages with possibly bad load balance
  int migration_type;

  // when MJ decides whether to migrate, the minimum imbalance for migration.
  double minimum_migration_imbalance;

  // Nonuniform first level partitioning
  // (Currently available only for sequential_task_partitioning):
  // Used for Dragonfly task mapping by partitioning Dragonfly RCA
  // machine coordinates and application coordinates.
  // An optimization that completely partitions the most important machine
  // dimension first (i.e. the Dragonfly group coordinate, or RCA's x
  // coordinate). The standard MJ alg follows after the nonuniform first level
  // partitioning.
  //
  // Ex. (first level partitioning): If we have 120 elements,
  // num_first_level_parts = 3, first_level_distribution = [4, 10, 6], then
  // part sizes after the first level will be [24, 60, 36]. Standard uniform
  // MJ continues for all subsequent levels; a worked sketch follows below.
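  //
  // Illustrative sketch (not part of the original header) of the first-level
  // arithmetic above, with hypothetical local names:
  //
  //   int num_elements = 120;
  //   int dist[3] = {4, 10, 6};          // first_level_distribution
  //   int dist_sum = 4 + 10 + 6;         // 20
  //   // part k receives num_elements * dist[k] / dist_sum:
  //   // 120 * 4 / 20 = 24, 120 * 10 / 20 = 60, 120 * 6 / 20 = 36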

  // If used, number of parts requested for a nonuniform
  // first level partitioning
  mj_part_t num_first_level_parts;

  // If used, the requested distribution of parts for the
  // nonuniform first level partitioning
  Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;

  mj_part_t total_num_cut;  // total number of cuts
  mj_part_t total_num_part; // total number of parts

  mj_part_t max_num_part_along_dim; // maximum part count along a dimension.
  mj_part_t max_num_cut_along_dim;  // maximum cut count along a dimension.

  // maximum part+cut count along a dimension.
  size_t max_num_total_part_along_dim;

  mj_part_t total_dim_num_reduce_all; // estimated number of reduceAlls.

  // maximum number of parts that might occur during the partition before the
  // last partitioning dimension.
  mj_part_t last_dim_num_part;

  // input part array specifying num part to divide along each dim.
  Kokkos::View<mj_part_t *, Kokkos::HostSpace> part_no_array;

  // two dimension coordinate array
  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
  Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
    mj_coordinates;

  // two dimension weight array
  Kokkos::View<mj_scalar_t **, device_t> mj_weights;

  // if the target parts are uniform
  Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_parts;

  // if the coordinates have uniform weights
  Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_weights;

  int mj_num_teams; // the number of teams

  size_t num_global_parts; // the targeted number of parts

  // vector of all boxes for all parts, constructed if mj_keep_part_boxes true
  RCP<mj_partBoxVector_t> kept_boxes;

  RCP<mj_partBox_t> global_box;

  int myRank;       // processor rank
  int myActualRank; // initial rank

  bool divide_to_prime_first;

  // initial global ids of the coordinates.
  Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;

  // current global ids of the coordinates, might change during migration.
  Kokkos::View<mj_gno_t*, device_t> current_mj_gnos;

  // the actual processor owner of the coordinate, to track after migrations.
  Kokkos::View<int*, Kokkos::HostSpace> owner_of_coordinate;

  // permutation of coordinates, for partitioning.
  Kokkos::View<mj_lno_t*, device_t> coordinate_permutations;

  // permutation work array.
  Kokkos::View<mj_lno_t*, device_t> new_coordinate_permutations;

  // the part ids assigned to coordinates.
  Kokkos::View<mj_part_t*, device_t> assigned_part_ids;

  // beginning and end of each part.
  Kokkos::View<mj_lno_t *, device_t> part_xadj;

  // work array for beginning and end of each part.
  Kokkos::View<mj_lno_t *, device_t> new_part_xadj;

  Kokkos::View<mj_scalar_t *, device_t> all_cut_coordinates;

  // how much weight an MPI rank should put on the left side of each cutline
  Kokkos::View<mj_scalar_t *, device_t>
    process_cut_line_weight_to_put_left;

  // weight percentage each thread in an MPI rank puts on the left side of
  // each cutline
  Kokkos::View<mj_scalar_t *, device_t>
    thread_cut_line_weight_to_put_left;

  // work array to manipulate coordinates of cutlines in different iterations.
  // necessary because previous cut line information is used for determining
  // the next cutline information. therefore, cannot update the cut work array
  // until all cutlines are determined.
  Kokkos::View<mj_scalar_t *, device_t> cut_coordinates_work_array;

  // Used for swapping with the above cut_coordinates_work_array
  Kokkos::View<mj_scalar_t *, device_t> temp_cut_coords;

  // cumulative part weight array.
  Kokkos::View<mj_scalar_t *, device_t> target_part_weights;

  // upper bound coordinate of a cut line
  Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_coordinates;

  // lower bound coordinate of a cut line
  Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_coordinates;

  // lower bound weight of a cut line
  Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_weights;

  // upper bound weight of a cut line
  Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_weights;

  // combined array to exchange the min and max coordinate, and total
  // weight of part.
  Kokkos::View<mj_scalar_t *, device_t>
    process_local_min_max_coord_total_weight;

  // global combined array with the results for min, max and total weight.
  Kokkos::View<mj_scalar_t *, device_t>
    global_min_max_coord_total_weight;

  // is_cut_line_determined is used to determine if a cutline is determined
  // already. If a cut line is already determined, the next iterations will
  // skip this cut line.
  Kokkos::View<bool *, device_t> is_cut_line_determined;

  // incomplete_cut_count holds the number of cutlines that have not
  // been finalized for each part when concurrentPartCount>1. Using this
  // information, if incomplete_cut_count[x]==0, then no work is done
  // for this part.
  Kokkos::View<mj_part_t *, device_t> device_incomplete_cut_count;
  typename decltype(device_incomplete_cut_count)::HostMirror
    incomplete_cut_count;

  // Need a quick accessor for this on host
  typename decltype (part_xadj)::HostMirror host_part_xadj;

  // local part weights of each thread.
  Kokkos::View<double *, device_t>
    thread_part_weights;

  // the work manipulation array for partweights.
  Kokkos::View<double *, device_t>
    thread_part_weight_work;

  // thread_cut_left_closest_point to hold the closest coordinate
  // to a cutline from left (for each thread).
  Kokkos::View<mj_scalar_t *, device_t>
    thread_cut_left_closest_point;

  // thread_cut_right_closest_point to hold the closest coordinate
  // to a cutline from right (for each thread)
  Kokkos::View<mj_scalar_t *, device_t>
    thread_cut_right_closest_point;

  // to store how many points in each part a thread has.
  Kokkos::View<mj_lno_t *, device_t>
    thread_point_counts;

  Kokkos::View<mj_scalar_t *, device_t> process_rectilinear_cut_weight;
  Kokkos::View<mj_scalar_t *, device_t> global_rectilinear_cut_weight;

  // for faster communication, concatenation of
  // totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
  // leftClosest distances sized P-1, since P-1 cut lines
  // rightClosest distances sized P-1, since P-1 cut lines.
  Kokkos::View<mj_scalar_t *, device_t>
    total_part_weight_left_right_closests;
  Kokkos::View<mj_scalar_t *, device_t>
    global_total_part_weight_left_right_closests;

  Kokkos::View<mj_part_t*, device_t> device_num_partitioning_in_current_dim;
  typename decltype(device_num_partitioning_in_current_dim)::HostMirror
    host_num_partitioning_in_current_dim; // for quick access on host

  /* \brief helper function to calculate imbalance.
   * \param achieved balance we achieved.
   * \param expected balance expected.
   */
  static
  KOKKOS_INLINE_FUNCTION
  double calculate_imbalance(mj_scalar_t achieved, mj_scalar_t expected) {
    return static_cast<double>(achieved) / static_cast<double>(expected) - 1.0;
  }
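
  // Illustrative worked values (not part of the original header): for an
  // achieved part weight of 55 against an expected weight of 50,
  // calculate_imbalance(55, 50) = 55.0 / 50.0 - 1.0 = 0.10, i.e. 10%
  // imbalance; a perfectly balanced part yields 0.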

  /* \brief Either the mj array (part_no_array) or num_global_parts should be
   * provided in the input. part_no_array takes precedence if both are
   * provided. Depending on these parameters, total cut/part number, maximum
   * part/cut number along a dimension, estimated number of reduceAlls,
   * and the number of parts before the last dimension is calculated.
   * */
  void set_part_specifications();

  /* \brief Tries to determine the part number for the current dimension,
   * by trying to make the partitioning as square as possible.
   * \param num_total_future how many more partitionings are required.
   * \param root how much more recursion depth is left.
   */
  inline mj_part_t get_part_count(
    mj_part_t num_total_future,
    double root);

  /* \brief for part communication we keep track of the box boundaries.
   * This is performed when either asked specifically, or when geometric
   * mapping is performed afterwards. This function initializes a single box
   * with all global min and max coordinates.
   * \param initial_partitioning_boxes the input and output vector for boxes.
   */
  void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);

  /* \brief Function returns how many parts will be obtained after this
   * dimension partitioning. It sets how many parts each current part will be
   * partitioned into in this dimension in the
   * device_num_partitioning_in_current_dim vector, and sets how many total
   * future parts each obtained part will be partitioned into in the
   * next_future_num_parts_in_parts vector. If part boxes are kept, it
   * initializes the output_part_boxes from their ancestors.
   * \param future_num_part_in_parts: input, how many future parts each
   * current part will be partitioned into.
   * \param next_future_num_parts_in_parts: output, how many future parts
   * each obtained part will be partitioned into.
   * \param future_num_parts: output, max number of future parts that will be
   * obtained from a single part.
   * \param current_num_parts: input, how many parts are there currently.
   * \param current_iteration: input, current dimension iteration number.
   * \param input_part_boxes: input, if boxes are kept, current boxes.
   * \param output_part_boxes: output, if boxes are kept, the initial box
   * boundaries for obtained parts.
   * \param atomic_part_count // DOCWORK: Documentation
   */
  mj_part_t update_part_num_arrays(
    std::vector<mj_part_t> *future_num_part_in_parts,
    std::vector<mj_part_t> *next_future_num_parts_in_parts,
    mj_part_t &future_num_parts,
    mj_part_t current_num_parts,
    int current_iteration,
    RCP<mj_partBoxVector_t> input_part_boxes,
    RCP<mj_partBoxVector_t> output_part_boxes,
    mj_part_t atomic_part_count);

  static
  KOKKOS_INLINE_FUNCTION
  void mj_calculate_new_cut_position (
    mj_scalar_t cut_upper_bound,
    mj_scalar_t cut_lower_bound,
    mj_scalar_t cut_upper_weight,
    mj_scalar_t cut_lower_weight,
    mj_scalar_t expected_weight,
    mj_scalar_t &new_cut_position,
    mj_scalar_t sEpsilon);

  bool mj_perform_migration(
    mj_part_t in_num_parts, //current number of parts
    mj_part_t &out_num_parts, //output number of parts.
    std::vector<mj_part_t> *next_future_num_parts_in_parts,
    mj_part_t &output_part_begin_index,
    size_t migration_reduce_all_population,
    mj_lno_t num_coords_for_last_dim_part,
    std::string iteration,
    RCP<mj_partBoxVector_t> &input_part_boxes,
    RCP<mj_partBoxVector_t> &output_part_boxes);

  bool mj_check_to_migrate(
    size_t migration_reduce_all_population,
    mj_lno_t num_coords_for_last_dim_part,
    mj_part_t num_procs,
    mj_part_t num_parts,
    mj_gno_t *num_points_in_all_processor_parts);

  void mj_migration_part_proc_assignment(
    mj_gno_t * num_points_in_all_processor_parts,
    mj_part_t num_parts,
    mj_part_t num_procs,
    mj_lno_t *send_count_to_each_proc,
    std::vector<mj_part_t> &processor_ranks_for_subcomm,
    std::vector<mj_part_t> *next_future_num_parts_in_parts,
    mj_part_t &out_num_part,
    std::vector<mj_part_t> &out_part_indices,
    mj_part_t &output_part_numbering_begin_index,
    int *coordinate_destinations);

  void mj_assign_proc_to_parts(
    mj_gno_t * num_points_in_all_processor_parts,
    mj_part_t num_parts,
    mj_part_t num_procs,
    mj_lno_t *send_count_to_each_proc,
    std::vector<mj_part_t> &processor_ranks_for_subcomm,
    std::vector<mj_part_t> *next_future_num_parts_in_parts,
    mj_part_t &out_part_index,
    mj_part_t &output_part_numbering_begin_index,
    int *coordinate_destinations);

  void assign_send_destinations(
    mj_part_t num_parts,
    mj_part_t *part_assignment_proc_begin_indices,
    mj_part_t *processor_chains_in_parts,
    mj_lno_t *send_count_to_each_proc,
    int *coordinate_destinations);

  void assign_send_destinations2(
    mj_part_t num_parts,
    uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
    int *coordinate_destinations,
    mj_part_t &output_part_numbering_begin_index,
    std::vector<mj_part_t> *next_future_num_parts_in_parts);

  void mj_assign_parts_to_procs(
    mj_gno_t * num_points_in_all_processor_parts,
    mj_part_t num_parts,
    mj_part_t num_procs,
    mj_lno_t *send_count_to_each_proc,
    std::vector<mj_part_t> *next_future_num_parts_in_parts,
    mj_part_t &out_num_part,
    std::vector<mj_part_t> &out_part_indices,
    mj_part_t &output_part_numbering_begin_index,
    int *coordinate_destinations);

  void mj_migrate_coords(
    mj_part_t num_procs,
    mj_lno_t &num_new_local_points,
    std::string iteration,
    int *coordinate_destinations,
    mj_part_t num_parts);

  void create_sub_communicator(
    std::vector<mj_part_t> &processor_ranks_for_subcomm);

  mj_part_t find_largest_prime_factor(mj_part_t num_parts) {
    mj_part_t largest_factor = 1;
    mj_part_t n = num_parts;
    mj_part_t divisor = 2;
    while (n > 1) {
      while (n % divisor == 0) {
        n = n / divisor;
        largest_factor = divisor;
      }
      ++divisor;
      if(divisor * divisor > n) {
        if(n > 1) {
          largest_factor = n;
        }
        break;
      }
    }
    return largest_factor;
  }
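
  // Illustrative worked values (not part of the original header) for the
  // helper above:
  //   find_largest_prime_factor(12) == 3   (12 = 2 * 2 * 3)
  //   find_largest_prime_factor(34) == 17  (34 = 2 * 17)
  //   find_largest_prime_factor(13) == 13  (13 is prime)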

public:
  AlgMJ();

  // DOCWORK: Make param documentation use : consistently
  void multi_jagged_part(
    const RCP<const Environment> &env,
    RCP<const Comm<int> > &problemComm,
    double imbalance_tolerance,
    int num_teams,
    size_t num_global_parts,
    Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array,
    int recursion_depth,
    int coord_dim,
    mj_lno_t num_local_coords,
    mj_gno_t num_global_coords,
    Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos,
    // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
    Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates,
    int num_weights_per_coord,
    Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights,
    Kokkos::View<mj_scalar_t**, device_t> & mj_weights,
    Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts,
    Kokkos::View<mj_part_t*, device_t> & result_assigned_part_ids,
    Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos);

  void set_partitioning_parameters(
    bool distribute_points_on_cut_lines_,
    int max_concurrent_part_calculation_,
    int check_migrate_avoid_migration_option_,
    double minimum_migration_imbalance_,
    int migration_type_ = 0);

  void set_to_keep_part_boxes();

  RCP<mj_partBox_t> get_global_box() const;

  RCP<mj_partBoxVector_t> get_kept_boxes() const;

  RCP<mj_partBoxVector_t> compute_global_box_boundaries(
    RCP<mj_partBoxVector_t> &localPartBoxes) const;

  void sequential_task_partitioning(
    const RCP<const Environment> &env,
    mj_lno_t num_total_coords,
    mj_lno_t num_selected_coords,
    size_t num_target_part,
    int coord_dim,
    // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
    Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
    Kokkos::View<mj_lno_t *, device_t> &
      initial_selected_coords_output_permutation,
    mj_lno_t *output_xadj,
    int recursion_depth_,
    const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array,
    bool partition_along_longest_dim,
    int num_ranks_per_node,
    bool divide_to_prime_first_,
    mj_part_t num_first_level_parts_ = 1,
    const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_
      = Kokkos::View<mj_part_t *, Kokkos::HostSpace>());

#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
  public:
#else
  private:
#endif

  /* \brief Allocates all required memory for the mj partitioning algorithm.
   */
  void allocate_set_work_memory();

  /* \brief compute global bounding box: min/max coords of global domain */
  void compute_global_box();

  // DOCWORK: Inconsistent use of ! for descriptive/brief commenting - decide.
  void mj_get_local_min_max_coord_totW(
    mj_part_t current_work_part,
    mj_part_t current_concurrent_num_parts,
    Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords);

  void mj_get_global_min_max_coord_totW(
    mj_part_t current_concurrent_num_parts,
    Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
    Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total);

  void mj_get_initial_cut_coords_target_weights(
    mj_scalar_t min_coord,
    mj_scalar_t max_coord,
    mj_part_t num_cuts /*p-1*/,
    mj_scalar_t global_weight,
    Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
    Kokkos::View<mj_scalar_t *, device_t> & target_part_weights,
    std::vector <mj_part_t> *future_num_part_in_parts,
    std::vector <mj_part_t> *next_future_num_parts_in_parts,
    mj_part_t concurrent_current_part,
    mj_part_t obtained_part_index,
    mj_part_t num_target_first_level_parts = 1,
    const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist =
      Kokkos::View<mj_part_t *, Kokkos::HostSpace>());

  void set_initial_coordinate_parts(
    mj_scalar_t &max_coordinate,
    mj_scalar_t &min_coordinate,
    mj_lno_t coordinate_begin_index,
    mj_lno_t coordinate_end_index,
    Kokkos::View<mj_lno_t *, device_t> &
      mj_current_coordinate_permutations,
    Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
    Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
    mj_part_t &partition_count);

  void mj_1D_part(
    Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
    double imbalanceTolerance,
    mj_part_t current_work_part,
    mj_part_t current_concurrent_num_parts,
    Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
    mj_part_t total_incomplete_cut_count,
    Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
    Kokkos::View<size_t*, device_t> & view_total_reduction_size);

  void mj_1D_part_get_part_weights(
    mj_part_t current_concurrent_num_parts,
    mj_part_t current_work_part,
    Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
    int loop_count);

  void mj_combine_rightleft_and_weights(
    mj_part_t current_work_part,
    mj_part_t current_concurrent_num_parts);

  void mj_create_new_partitions(
    mj_part_t num_parts,
    mj_part_t current_concurrent_work_part,
    Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
    Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
    Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
    Kokkos::View<mj_lno_t *, device_t> & out_part_xadj);

  void mj_get_new_cut_coordinates(
    mj_part_t current_concurrent_num_parts,
    mj_part_t kk,
    const mj_part_t &num_cuts,
    const double &used_imbalance_tolerance,
    Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
    Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
    Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
    Kokkos::View<bool *, device_t> & current_cut_line_determined,
    Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
    Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
    Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
    Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
    Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
    Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
    Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
    Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
    Kokkos::View<mj_scalar_t *, device_t> &
      current_part_cut_line_weight_to_put_left,
    Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count);

  void get_processor_num_points_in_parts(
    mj_part_t num_procs,
    mj_part_t num_parts,
    mj_gno_t *&num_points_in_all_processor_parts);

  void fill_permutation_array(
    mj_part_t output_num_parts,
    mj_part_t num_parts);

  void create_consistent_chunks(
    mj_part_t num_parts,
    Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
    Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
    mj_lno_t coordinate_begin,
    mj_lno_t coordinate_end,
    Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
    Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
    int coordInd,
    bool longest_dim_part,
    uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted);

  void set_final_parts(
    mj_part_t current_num_parts,
    mj_part_t output_part_begin_index,
    RCP<mj_partBoxVector_t> &output_part_boxes,
    bool is_data_ever_migrated);
};

template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t, typename mj_node_t>
AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::AlgMJ():
  mj_env(), mj_problemComm(), comm(), imbalance_tolerance(0),
  recursion_depth(0), coord_dim(0),
  num_weights_per_coord(0), initial_num_loc_coords(0),
  initial_num_glob_coords(0),
  num_local_coords(0), num_global_coords(0),
  sEpsilon(std::numeric_limits<mj_scalar_t>::epsilon() * 100),
  distribute_points_on_cut_lines(true),
  max_concurrent_part_calculation(1),
  mj_run_as_rcb(false), mj_user_recursion_depth(0),
  mj_keep_part_boxes(false),
  check_migrate_avoid_migration_option(0), migration_type(0),
  minimum_migration_imbalance(0.30),
  num_first_level_parts(1),
  total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
  max_num_cut_along_dim(0),
  max_num_total_part_along_dim(0),
  total_dim_num_reduce_all(0),
  last_dim_num_part(0),
  mj_num_teams(0),
  num_global_parts(1),
  kept_boxes(), global_box(),
  myRank(0), myActualRank(0),
  divide_to_prime_first(false)
{
}

template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t, typename mj_node_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
  sequential_task_partitioning(
  const RCP<const Environment> &env,
  mj_lno_t num_total_coords,
  mj_lno_t num_selected_coords,
  size_t num_target_part,
  int coord_dim_,
  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
  Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> &
    mj_coordinates_,
  Kokkos::View<mj_lno_t *, device_t> & initial_adjList_output_adjlist,
  mj_lno_t *output_xadj,
  int recursion_depth_,
  const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array_,
  bool partition_along_longest_dim,
  int num_ranks_per_node,
  bool divide_to_prime_first_,
  mj_part_t num_first_level_parts_,
  const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_)
{
  this->mj_env = env;
  const RCP<Comm<int> > commN;
  this->mj_problemComm = Teuchos::DefaultComm<int>::getDefaultSerialComm(commN);
  this->comm = Teuchos::rcp_const_cast<Comm<int> >(this->mj_problemComm);
  this->myActualRank = this->myRank = 1;

  this->divide_to_prime_first = divide_to_prime_first_;
  // weights are uniform for task mapping

  // parts are uniform for task mapping
  // as input indices.
  this->imbalance_tolerance = 0;
  this->num_global_parts = num_target_part;
  this->part_no_array = part_no_array_;
  this->recursion_depth = recursion_depth_;

  // If nonuniform first level partitioning, the requested num of parts and the
  // requested distribution of elements for each part
  this->num_first_level_parts = num_first_level_parts_;

  this->first_level_distribution = first_level_distribution_;

  this->coord_dim = coord_dim_;
  this->num_local_coords = num_total_coords;

  this->num_global_coords = num_total_coords;
  this->mj_coordinates = mj_coordinates_;


  this->initial_mj_gnos =
    Kokkos::View<mj_gno_t*, device_t>("gids", this->num_local_coords);

  this->num_weights_per_coord = 0;

  this->mj_uniform_weights = Kokkos::View<bool*, Kokkos::HostSpace>(
    "uniform weights", 1);
  this->mj_uniform_weights(0) = true;

  this->mj_weights = Kokkos::View<mj_scalar_t**, device_t>
    ("weights", 1, 1);

  this->mj_uniform_parts =
    Kokkos::View<bool*, Kokkos::HostSpace>("uniform parts", 1);
  this->mj_uniform_parts(0) = true;

  this->set_part_specifications();

  this->allocate_set_work_memory();

  // Do single init
  auto local_part_xadj = this->part_xadj;
  Kokkos::parallel_for(
    Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
    KOKKOS_LAMBDA (int dummy) {
    local_part_xadj(0) = static_cast<mj_lno_t>(num_selected_coords);
  });

  Kokkos::deep_copy(coordinate_permutations, initial_adjList_output_adjlist);

  mj_part_t current_num_parts = 1;

  Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
    this->all_cut_coordinates;

  mj_part_t future_num_parts = this->total_num_part;

  std::vector<mj_part_t> *future_num_part_in_parts =
    new std::vector<mj_part_t>();
  std::vector<mj_part_t> *next_future_num_parts_in_parts =
    new std::vector<mj_part_t>();
  next_future_num_parts_in_parts->push_back(this->num_global_parts);
  RCP<mj_partBoxVector_t> t1;
  RCP<mj_partBoxVector_t> t2;

  std::vector <uSignedSortItem<int, mj_scalar_t, char>>
    coord_dimension_range_sorted(this->coord_dim);
  uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted =
    &(coord_dimension_range_sorted[0]);
  std::vector <mj_scalar_t> coord_dim_mins(this->coord_dim);
  std::vector <mj_scalar_t> coord_dim_maxs(this->coord_dim);

  // Need a device counter - how best to allocate?
  // Putting this allocation in the loops is very costly so moved out here.
  Kokkos::View<mj_part_t*, device_t>
    view_rectilinear_cut_count("view_rectilinear_cut_count", 1);
  Kokkos::View<size_t*, device_t>
    view_total_reduction_size("view_total_reduction_size", 1);

  for(int rd = 0; rd < this->recursion_depth; ++rd) {
    // next_future_num_parts_in_parts will be of the size of outnumParts,
    // and it will hold how many more parts each output part should be
    // divided into. this array will also be used to determine the weight
    // ratios of the parts.
    // swap the arrays to use iteratively.
    std::vector<mj_part_t> *tmpPartVect = future_num_part_in_parts;
    future_num_part_in_parts = next_future_num_parts_in_parts;
    next_future_num_parts_in_parts = tmpPartVect;

    // clear next_future_num_parts_in_parts array as
    // getPartitionArrays expects it to be empty.
    next_future_num_parts_in_parts->clear();

    // returns the total number of output parts for this dimension partitioning.
    mj_part_t output_part_count_in_dimension =
      this->update_part_num_arrays(
        future_num_part_in_parts,
        next_future_num_parts_in_parts,
        future_num_parts,
        current_num_parts,
        rd,
        t1,
        t2, num_ranks_per_node);

    // if the number of obtained parts is equal to the current number of parts,
    // skip this dimension. For example, this happens when a 1 is given in
    // the input part array, e.g. P=4,5,1,2.
    if(output_part_count_in_dimension == current_num_parts) {
      tmpPartVect = future_num_part_in_parts;
      future_num_part_in_parts = next_future_num_parts_in_parts;
      next_future_num_parts_in_parts = tmpPartVect;
      continue;
    }

    // convert rd to string to be used for debugging purposes.
    std::string istring = std::to_string(rd);

    // allocate memory to hold the indices
    // of the parts in the permutation array.
    this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
      "new part xadj", output_part_count_in_dimension);

    // the index where the output total counts (outTotalCounts) will be written.
    mj_part_t output_part_index = 0;

    // whatever is written to outTotalCounts will be added with previousEnd
    // so that the points will be shifted.
    mj_part_t output_coordinate_end_index = 0;

    mj_part_t current_work_part = 0;
    mj_part_t current_concurrent_num_parts = 1;

    mj_part_t obtained_part_index = 0;

    // get the coordinate axis along which the partitioning will be done.
    int coordInd = rd % this->coord_dim;

    Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
      Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);

    auto host_process_local_min_max_coord_total_weight =
      Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
    auto host_global_min_max_coord_total_weight =
      Kokkos::create_mirror_view(global_min_max_coord_total_weight);

    // run for all available parts.
    for(; current_work_part < current_num_parts;
      current_work_part += current_concurrent_num_parts) {

      mj_part_t actual_work_part_count = 0;

      // initialization for 1D partitioning.
      // get the min and max coordinates of each part
      // together with the part weights of each part.
      for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
        mj_part_t current_work_part_in_concurrent_parts =
          current_work_part + kk;

        // if this part won't be partitioned any further
        // don't do any work for this part.
        mj_part_t partition_count = host_num_partitioning_in_current_dim(
          current_work_part_in_concurrent_parts);
        if(partition_count == 1) {
          continue;
        }
        ++actual_work_part_count;
        if(partition_along_longest_dim) {
          auto local_process_local_min_max_coord_total_weight =
            this->process_local_min_max_coord_total_weight;
          for(int coord_traverse_ind = 0;
            coord_traverse_ind < this->coord_dim; ++coord_traverse_ind) {

            Kokkos::View<mj_scalar_t *, device_t> coords =
              Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coord_traverse_ind);

            this->mj_get_local_min_max_coord_totW(
              current_work_part,
              current_concurrent_num_parts,
              coords);

            coord_dimension_range_sorted[coord_traverse_ind].id =
              coord_traverse_ind;
            coord_dimension_range_sorted[coord_traverse_ind].signbit = 1;

            Kokkos::deep_copy(host_process_local_min_max_coord_total_weight,
              process_local_min_max_coord_total_weight);

            coord_dim_mins[coord_traverse_ind] =
              host_process_local_min_max_coord_total_weight(kk);
            coord_dim_maxs[coord_traverse_ind] =
              host_process_local_min_max_coord_total_weight(
                kk + current_concurrent_num_parts);
            coord_dimension_range_sorted[coord_traverse_ind].val =
              host_process_local_min_max_coord_total_weight(
                kk + current_concurrent_num_parts) -
              host_process_local_min_max_coord_total_weight(kk);
          }

          uqSignsort(this->coord_dim, p_coord_dimension_range_sorted);
          coordInd = p_coord_dimension_range_sorted[this->coord_dim - 1].id;
          auto set_min = coord_dim_mins[coordInd];
          auto set_max = coord_dim_maxs[coordInd];
          Kokkos::parallel_for(
            Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
            (0, 1), KOKKOS_LAMBDA (int dummy) {
            local_process_local_min_max_coord_total_weight(kk) = set_min;
            local_process_local_min_max_coord_total_weight(
              kk + current_concurrent_num_parts) = set_max;
          });

          mj_current_dim_coords =
            Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
        }
        else {
          Kokkos::View<mj_scalar_t *, device_t> coords =
            Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
          this->mj_get_local_min_max_coord_totW(
            current_work_part,
            current_concurrent_num_parts,
            coords);
        }
      }

      // 1D partitioning
      if(actual_work_part_count > 0) {
        // obtain global min and max of the part.
        this->mj_get_global_min_max_coord_totW(
          current_concurrent_num_parts,
          this->process_local_min_max_coord_total_weight,
          this->global_min_max_coord_total_weight);

        // update host copy
        Kokkos::deep_copy(host_global_min_max_coord_total_weight,
          global_min_max_coord_total_weight);

        // represents the total number of cutlines
        // whose coordinate should be determined.
        mj_part_t total_incomplete_cut_count = 0;

        // Compute weight ratios for parts & cuts:
        // e.g., 0.25  0.25  0.5   0.5   0.75  0.75  1.0
        //       part0 cut0  part1 cut1  part2 cut2  part3
        mj_part_t concurrent_part_cut_shift = 0;
        mj_part_t concurrent_part_part_shift = 0;
        for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
          mj_scalar_t min_coordinate =
            host_global_min_max_coord_total_weight(kk);
          mj_scalar_t max_coordinate = host_global_min_max_coord_total_weight(
            kk + current_concurrent_num_parts);
          mj_scalar_t global_total_weight = host_global_min_max_coord_total_weight(
            kk + 2 * current_concurrent_num_parts);

          mj_part_t concurrent_current_part_index = current_work_part + kk;

          mj_part_t partition_count = host_num_partitioning_in_current_dim(
            concurrent_current_part_index);

          Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
            Kokkos::subview(current_cut_coordinates,
              std::pair<mj_lno_t, mj_lno_t>(
                concurrent_part_cut_shift,
                current_cut_coordinates.size()));
          Kokkos::View<mj_scalar_t *, device_t>
            current_target_part_weights =
            Kokkos::subview(target_part_weights,
              std::pair<mj_lno_t, mj_lno_t>(
                concurrent_part_part_shift,
                target_part_weights.size()));

          // shift the usedCutCoordinate array as noCuts.
          concurrent_part_cut_shift += partition_count - 1;
          // shift the partRatio array as noParts.
          concurrent_part_part_shift += partition_count;
          // calculate only if part is not empty,
          // and part will be further partitioned.
          if(partition_count > 1 && min_coordinate <= max_coordinate) {
            // increase allDone by the number of cuts of the current
            // part's cut line number.
            total_incomplete_cut_count += partition_count - 1;

            this->incomplete_cut_count(kk) = partition_count - 1;

            // When num_first_level_parts != 1 we have
            // nonuniform partitioning on the first level, providing
            // requested number of parts (num_first_level_parts) and
            // requested distribution in parts (first_level_distribution)

            // Get the target part weights given a desired distribution
            this->mj_get_initial_cut_coords_target_weights(
              min_coordinate,
              max_coordinate,
              partition_count - 1,
              global_total_weight,
              usedCutCoordinate,
              current_target_part_weights,
              future_num_part_in_parts,
              next_future_num_parts_in_parts,
              concurrent_current_part_index,
              obtained_part_index,
              rd == 0 ? this->num_first_level_parts : 1,
              this->first_level_distribution);

            mj_lno_t coordinate_end_index =
              host_part_xadj(concurrent_current_part_index);
            mj_lno_t coordinate_begin_index =
              (concurrent_current_part_index == 0) ? 0 :
              host_part_xadj[concurrent_current_part_index - 1];

            // get the initial estimated part assignments of the coordinates.
            this->set_initial_coordinate_parts(
              max_coordinate,
              min_coordinate,
              coordinate_begin_index, coordinate_end_index,
              this->coordinate_permutations,
              mj_current_dim_coords,
              this->assigned_part_ids,
              partition_count);
          }
          else {
            // e.g., if we have fewer coordinates than parts, we don't need
            // to do the next dim.
            this->incomplete_cut_count(kk) = 0;
          }
          obtained_part_index += partition_count;
        }

        // used imbalance, it is always 0, as it is difficult
        // to estimate a range.
        double used_imbalance = 0;

        // Determine cut lines for k parts here.
        this->mj_env->timerStart(MACRO_TIMERS,
          mj_timer_base_string + "mj_1D_part()");

        this->mj_1D_part(
          mj_current_dim_coords,
          used_imbalance,
          current_work_part,
          current_concurrent_num_parts,
          current_cut_coordinates,
          total_incomplete_cut_count,
          view_rectilinear_cut_count,
          view_total_reduction_size);

        this->mj_env->timerStop(MACRO_TIMERS,
          mj_timer_base_string + "mj_1D_part()");
      }
      else {
        obtained_part_index += current_concurrent_num_parts;
      }
      // create part chunks
      {
        mj_part_t output_array_shift = 0;
        mj_part_t cut_shift = 0;
        size_t tlr_shift = 0;
        size_t partweight_array_shift = 0;

        for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
          mj_part_t current_concurrent_work_part = current_work_part + kk;

          mj_part_t num_parts = host_num_partitioning_in_current_dim(
            current_concurrent_work_part);

          // if the part is empty, skip the part.
          int coordinateA_bigger_than_coordinateB =
            host_global_min_max_coord_total_weight(kk) >
            host_global_min_max_coord_total_weight(
              kk + current_concurrent_num_parts);

          if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
            // we still need to write the begin and end point of the empty part.
            // simply set it zero, the array indices will be shifted later
            auto local_new_part_xadj = this->new_part_xadj;
            Kokkos::parallel_for(
              Kokkos::RangePolicy<typename mj_node_t::execution_space,
              mj_part_t> (0, num_parts), KOKKOS_LAMBDA(mj_part_t jj) {
              local_new_part_xadj(
                output_part_index + output_array_shift + jj) = 0;
            });

            cut_shift += num_parts - 1;
            tlr_shift += (4 * (num_parts - 1) + 1);
            output_array_shift += num_parts;
            partweight_array_shift += (2 * (num_parts - 1) + 1);
            continue;
          }
          mj_lno_t coordinate_end =
            host_part_xadj(current_concurrent_work_part);
          mj_lno_t coordinate_begin =
            current_concurrent_work_part == 0 ? 0 :
            host_part_xadj(current_concurrent_work_part - 1);

          Kokkos::View<mj_scalar_t *, device_t>
            current_concurrent_cut_coordinate =
            Kokkos::subview(current_cut_coordinates,
              std::pair<mj_lno_t, mj_lno_t>(
                cut_shift,
                current_cut_coordinates.size()));
          Kokkos::View<mj_scalar_t *, device_t>
            used_local_cut_line_weight_to_left =
            Kokkos::subview(process_cut_line_weight_to_put_left,
              std::pair<mj_lno_t, mj_lno_t>(
                cut_shift,
                process_cut_line_weight_to_put_left.size()));

          this->thread_part_weight_work =
            Kokkos::subview(
              this->thread_part_weights,
              std::pair<mj_lno_t, mj_lno_t>(
                partweight_array_shift,
                this->thread_part_weights.size()));

          if(num_parts > 1) {
            // Rewrite the indices based on the computed cuts.
            Kokkos::View<mj_lno_t *, device_t> subview_new_part_xadj =
              Kokkos::subview(this->new_part_xadj,
                std::pair<mj_lno_t, mj_lno_t>(
                  output_part_index + output_array_shift,
                  this->new_part_xadj.size()));

            this->create_consistent_chunks(
              num_parts,
              mj_current_dim_coords,
              current_concurrent_cut_coordinate,
              coordinate_begin,
              coordinate_end,
              used_local_cut_line_weight_to_left,
              subview_new_part_xadj,
              coordInd,
              partition_along_longest_dim,
              p_coord_dimension_range_sorted);
          }
          else {
            // if this part is partitioned into 1 then just copy
            // the old values.
            mj_lno_t part_size = coordinate_end - coordinate_begin;

            auto local_new_part_xadj = this->new_part_xadj;
            Kokkos::parallel_for(
              Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
              (0, 1), KOKKOS_LAMBDA (int dummy) {
              local_new_part_xadj(output_part_index + output_array_shift)
                = part_size;
            });

            auto subview_new_coordinate_permutations =
              Kokkos::subview(this->new_coordinate_permutations,
                std::pair<mj_lno_t, mj_lno_t>(
                  coordinate_begin,
                  coordinate_begin + part_size));
            auto subview_coordinate_permutations =
              Kokkos::subview(this->coordinate_permutations,
                std::pair<mj_lno_t, mj_lno_t>(
                  coordinate_begin,
                  coordinate_begin + part_size));
            Kokkos::deep_copy(subview_new_coordinate_permutations,
              subview_coordinate_permutations);
          }

          cut_shift += num_parts - 1;
          tlr_shift += (4 * (num_parts - 1) + 1);
          output_array_shift += num_parts;
          partweight_array_shift += (2 * (num_parts - 1) + 1);
        }

        // shift cut coordinates so that all cut coordinates are stored.
        // current_cut_coordinates += cutShift;

        // getChunks from coordinates partitioned the parts and
        // wrote the indices as if there were a single part.
        // now we need to shift the beginning indices.
        for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
          mj_part_t num_parts =
            host_num_partitioning_in_current_dim(current_work_part + kk);
          auto local_new_part_xadj = this->new_part_xadj;
          auto local_mj_current_dim_coords = mj_current_dim_coords;
          auto local_new_coordinate_permutations =
            new_coordinate_permutations;
          Kokkos::parallel_for(
            Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t> (
            0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
            // shift it by previousCount
            local_new_part_xadj(output_part_index + ii) +=
              output_coordinate_end_index;

            if(ii % 2 == 1) {
              mj_lno_t coordinate_end =
                local_new_part_xadj(output_part_index + ii);
              mj_lno_t coordinate_begin =
                local_new_part_xadj(output_part_index);

              for(mj_lno_t task_traverse = coordinate_begin;
                task_traverse < coordinate_end; ++task_traverse) {
                mj_lno_t l = local_new_coordinate_permutations(task_traverse);
                // MARKER: FLIPPED ZORDER BELOW
                local_mj_current_dim_coords(l) = -local_mj_current_dim_coords(l);
              }
            }
          });

          // increase the previous count by the current end.
          mj_part_t get_single;
          Kokkos::parallel_reduce("Read new_part_xadj",
            Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
            KOKKOS_LAMBDA(int dummy, mj_part_t & set_single) {
            set_single = local_new_part_xadj(output_part_index + num_parts - 1);
          }, get_single);

          output_coordinate_end_index = get_single;
          // increase the current out.
          output_part_index += num_parts;
        }
      }
    }

    // end of this partitioning dimension
    // set the current num parts for next dim partitioning
    current_num_parts = output_part_count_in_dimension;

    // swap the coordinate permutations for the next dimension.
    Kokkos::View<mj_lno_t *, device_t> tmp = this->coordinate_permutations;
    this->coordinate_permutations = this->new_coordinate_permutations;
    this->new_coordinate_permutations = tmp;

    this->part_xadj = this->new_part_xadj;
    this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
    Kokkos::deep_copy(host_part_xadj, part_xadj); // keep in sync
    this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>("empty", 0);
  }

  Kokkos::deep_copy(initial_adjList_output_adjlist, coordinate_permutations);

  // Return output_xadj in CSR format
  output_xadj[0] = 0;
  for(size_t i = 0; i < this->num_global_parts; ++i) {
    output_xadj[i + 1] = host_part_xadj(i);
  }

  delete future_num_part_in_parts;
  delete next_future_num_parts_in_parts;
}
2053 
2057 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2058  typename mj_part_t, typename mj_node_t>
2059 RCP<typename AlgMJ
2060  <mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,mj_node_t>::mj_partBox_t>
2061 AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,mj_node_t>::
2062  get_global_box() const
2063 {
2064  return this->global_box;
2065 }
2066 
2069 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2070  typename mj_part_t, typename mj_node_t>
2071 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2072  mj_node_t>::set_to_keep_part_boxes()
2073 {
2074  this->mj_keep_part_boxes = true;
2075 }
2076 
2077 /* \brief Either the mj array (part_no_array) or num_global_parts should be
2078  * provided in the input. part_no_array takes
2079  * precedence if both are provided.
2080  * Depending on these parameters, total cut/part number,
2081  * maximum part/cut number along a dimension, estimated number of reduceAlls,
2082  * and the number of parts before the last dimension are calculated.
2083  * */
2084 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2085  typename mj_part_t, typename mj_node_t>
2086 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2087  set_part_specifications()
2088 {
2089  this->total_num_cut = 0; //how many cuts there will be in total
2090  this->total_num_part = 1; //how many parts there will be in total
2091  this->max_num_part_along_dim = 0; // maximum part count along a dimension.
2092  this->total_dim_num_reduce_all = 0; // estimated number of reduceAlls.
2093  this->last_dim_num_part = 1; //max no of parts that might occur
2094  //during the partition before the
2095  //last partitioning dimension.
2096  this->max_num_cut_along_dim = 0;
2097  this->max_num_total_part_along_dim = 0;
2098 
2099  if(this->part_no_array.size()) {
2100  auto local_recursion_depth = this->recursion_depth;
2101 
2102  this->total_dim_num_reduce_all =
2103  this->total_num_part * this->recursion_depth;
2104 
2105  this->total_num_part = 1;
2106  for(int i = 0; i < local_recursion_depth; ++i) {
2107  this->total_num_part *= this->part_no_array(i);
2108  }
2109 
2110  mj_part_t track_max = 0;
2111  for(int i = 0; i < local_recursion_depth; ++i) {
2112  if(part_no_array(i) > track_max) {
2113  track_max = this->part_no_array(i);
2114  }
2115  }
2116 
2117  this->last_dim_num_part = this->total_num_part /
2118  this->part_no_array(local_recursion_depth-1);
2119 
2120  this->max_num_part_along_dim = track_max;
2121  this->num_global_parts = this->total_num_part;
2122  } else {
2123  mj_part_t future_num_parts = this->num_global_parts;
2124 
2125  // If using nonuniform first level partitioning.
2126  // initial value max_num_part_along_dim == num_first_level_parts
2127  if (this->first_level_distribution.size() != 0 &&
2128  this->num_first_level_parts > 1) {
2129  this->max_num_part_along_dim = this->num_first_level_parts;
2130  }
2131 
2132  // we need to calculate the part numbers now, to determine
2133  // the maximum along the dimensions.
2134  for(int rd = 0; rd < this->recursion_depth; ++rd) {
2135  mj_part_t maxNoPartAlongI = 0;
2136  mj_part_t nfutureNumParts = 0;
2137 
2138  // Nonuniform first level partitioning sets part specifications for
2139  // rd == 0 only, given requested num of parts and distribution in parts
2140  // for the first level.
2141  if (rd == 0 &&
2142  this->first_level_distribution.size() != 0 &&
2143  this->num_first_level_parts > 1) {
2144 
2145  maxNoPartAlongI = this->num_first_level_parts;
2146  this->max_num_part_along_dim = this->num_first_level_parts;
2147 
2148  mj_part_t sum_first_level_dist = 0;
2149  mj_part_t max_part = 0;
2150 
2151  // Cumulative sum of distribution of parts and size of largest part
2152  for (int i = 0; i < this->num_first_level_parts; ++i) {
2153  sum_first_level_dist += this->first_level_distribution(i);
2154  if (this->first_level_distribution(i) > max_part)
2155  max_part = this->first_level_distribution(i);
2156  }
2157 
2158  // Total parts in largest nonuniform superpart from
2159  // first level partitioning
2160  nfutureNumParts =
2161  this->num_global_parts * max_part / sum_first_level_dist;
2162  }
2163  // Standard uniform partitioning this level
2164  else {
2165  maxNoPartAlongI = this->get_part_count(future_num_parts,
2166  1.0f / (this->recursion_depth - rd));
2167  if (maxNoPartAlongI > this->max_num_part_along_dim)
2168  this->max_num_part_along_dim = maxNoPartAlongI;
2169  nfutureNumParts = future_num_parts / maxNoPartAlongI;
2170  if (future_num_parts % maxNoPartAlongI) {
2171  ++nfutureNumParts;
2172  }
2173  }
2174  future_num_parts = nfutureNumParts;
2175  }
2176  this->total_num_part = this->num_global_parts;
2177 
2178  if(this->divide_to_prime_first) {
2179  this->total_dim_num_reduce_all = this->num_global_parts * 2;
2180  this->last_dim_num_part = this->num_global_parts;
2181  }
2182  else {
2183  //this is the lower bound.
2184  //estimate reduceAll Count here.
2185  //we find the upperbound instead.
2186  size_t p = 1;
2187  for(int i = 0; i < this->recursion_depth; ++i) {
2188  this->total_dim_num_reduce_all += p;
2189  p *= this->max_num_part_along_dim;
2190  }
2191 
2192  if(p / this->max_num_part_along_dim > this->num_global_parts) {
2193  this->last_dim_num_part = this->num_global_parts;
2194  }
2195  else {
2196  this->last_dim_num_part = p / this->max_num_part_along_dim;
2197  }
2198  }
2199  }
2200 
2201  this->total_num_cut = this->total_num_part - 1;
2202  this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
2203  this->max_num_total_part_along_dim = this->max_num_part_along_dim +
2204  size_t(this->max_num_cut_along_dim);
2205  // maxPartNo is P, maxCutNo = P-1, maxTotalPartCount = 2P-1
2206 
2207  // refine the concurrent part count, if it is given bigger than the maximum
2208  // possible part count.
2209  if(this->max_concurrent_part_calculation > this->last_dim_num_part) {
2210  if(this->mj_problemComm->getRank() == 0) {
2211  std::cerr << "Warning: Concurrent part count (" <<
2212  this->max_concurrent_part_calculation <<
2213  ") has been set bigger than maximum amount that can be used." <<
2214  " Setting to:" << this->last_dim_num_part << "." << std::endl;
2215  }
2216  this->max_concurrent_part_calculation = this->last_dim_num_part;
2217  }
2218 }
2219 
2220 /* \brief Tries to determine the part number for current dimension,
2221  * by trying to make the partitioning as square as possible.
2222  * \param num_total_future how many more parts this part must eventually be split into.
2223  * \param root the exponent to use, i.e. 1 / (number of remaining recursion levels).
2224  */
2225 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2226  typename mj_part_t, typename mj_node_t>
2227 inline mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2228  get_part_count(mj_part_t num_total_future, double root)
2229 {
2230  double fp = pow(num_total_future, root);
2231  mj_part_t ip = mj_part_t(fp);
2232  if(fp - ip < std::numeric_limits<float>::epsilon() * 100) {
2233  return ip;
2234  }
2235  else {
2236  return ip + 1;
2237  }
2238 }
2239 
2240 /* \brief Function returns how many parts that will be obtained after this
2241  * dimension partitioning. It sets how many parts each current part will be
2242  * partitioned into in this dimension to device_num_partitioning_in_current_dim
2243  * view, sets how many total future parts each obtained part will be
2244  * partitioned into in next_future_num_parts_in_parts vector. If part boxes are
2245  * kept, then initializes each output_part_boxes entry as its ancestor.
2246  * \param future_num_part_in_parts: input, how many future parts each current
2247  * part will be partitioned into.
2248  * \param next_future_num_parts_in_parts: output, how many future parts each
2249  * obtained part will be partitioned into.
2250  * \param future_num_parts: output, max number of future parts that will be
2251  * obtained from a single part.
2252  * \param current_num_parts: input, how many parts are there currently.
2253  * \param current_iteration: input, current dimension iteration number.
2254  * \param input_part_boxes: input, if boxes are kept, current boxes.
2255  * \param output_part_boxes: output, if boxes are kept, the initial box
2256  * boundaries for obtained parts.
2257  * \param atomic_part_count DOCWORK: Documentation
2258  */
2259 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2260  typename mj_part_t, typename mj_node_t>
2261 mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2262  update_part_num_arrays(
2263  std::vector<mj_part_t> *future_num_part_in_parts,
2264  std::vector<mj_part_t> *next_future_num_parts_in_parts,
2265  mj_part_t &future_num_parts,
2266  mj_part_t current_num_parts,
2267  int current_iteration,
2268  RCP<mj_partBoxVector_t> input_part_boxes,
2269  RCP<mj_partBoxVector_t> output_part_boxes,
2270  mj_part_t atomic_part_count)
2271 {
2272  std::vector<mj_part_t> num_partitioning_in_current_dim;
2273 
2274  // how many parts that will be obtained after this dimension.
2275  mj_part_t output_num_parts = 0;
2276  if(this->part_no_array.size()) {
2277  // when the partNo array is provided as input,
2278  // each current partition will be partition to the same number of parts.
2279  // we dont need to use the future_num_part_in_parts vector in this case.
2280  mj_part_t current_part_no_array =
2281  this->part_no_array(current_iteration);
2282 
2283  if(current_part_no_array < 1) {
2284  std::cout << "Current recursive iteration: " << current_iteration <<
2285  " part_no_array[" << current_iteration << "] is given as:" <<
2286  current_part_no_array << std::endl;
2287  std::terminate();
2288  }
2289  if(current_part_no_array == 1) {
2290  return current_num_parts;
2291  }
2292 
2293  // If using part_no_array, ensure compatibility with num_first_level_parts.
2294  if (this->first_level_distribution.size() != 0 &&
2295  current_iteration == 0 &&
2296  current_part_no_array != this->num_first_level_parts) {
2297  std::cout << "Current recursive iteration: " << current_iteration
2298  << " part_no_array[" << current_iteration << "] is given as: " <<
2299  current_part_no_array << " and contradicts num_first_level_parts: " <<
2300  this->num_first_level_parts << std::endl;
2301  std::terminate();
2302  }
2303 
2304  for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
2305  num_partitioning_in_current_dim.push_back(current_part_no_array);
2306  }
2307 
2308 /*
2309  std::cout << "\n\nme: " << this->myRank << " current_iteration: " <<
2310  current_iteration << " current_num_parts: " <<
2311  current_num_parts << "\n\n";
2312 
2313  std::cout << "\n\nnum_partitioning_in_current_dim[0]: " <<
2314  num_partitioning_in_current_dim[0] << "\n\n";
2315 
2316  std::cout << "\n\nfuture_num_parts: " << future_num_parts
2317  << " num_partitioning_in_current_dim[0]: " <<
2318  num_partitioning_in_current_dim[0] << " " <<
2319  future_num_parts / num_partitioning_in_current_dim[0] << "\n\n";
2320 */
2321 
2322  future_num_parts /= num_partitioning_in_current_dim[0];
2323  output_num_parts = current_num_parts *
2324  num_partitioning_in_current_dim[0];
2325  if(this->mj_keep_part_boxes) {
2326  for(mj_part_t k = 0; k < current_num_parts; ++k) {
2327  //initialized the output boxes as its ancestor.
2328  for(mj_part_t j = 0; j <
2329  num_partitioning_in_current_dim[0]; ++j) {
2330  output_part_boxes->push_back((*input_part_boxes)[k]);
2331  }
2332  }
2333  }
2334 
2335  // set how many more parts each part will be divided into.
2336  // this is obvious when partNo array is provided as input.
2337  // however, fill this so weights will be calculated according to this array.
2338  for(mj_part_t ii = 0; ii < output_num_parts; ++ii) {
2339  next_future_num_parts_in_parts->push_back(future_num_parts);
2340  }
2341  }
2342  else {
2343  // if partNo array is not provided as input, future_num_part_in_parts
2344  // holds how many parts each part should be divided into. Initially it holds a
2345  // single number equal to the total number of global parts.
2346 
2347  // calculate the future_num_parts from beginning,
2348  // since each part might be divided into different number of parts.
2349  future_num_parts = 1;
2350 
2351  // cout << "i:" << i << std::endl;
2352  for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
2353  // get how many parts this part should be divided into.
2354  mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];
2355 
2356  // get the ideal number of parts that is close to the
2357  // (recursion_depth - i) root of the future_num_parts_of_part_ii.
2358  mj_part_t num_partitions_in_current_dim =
2359  this->get_part_count(future_num_parts_of_part_ii,
2360  1.0 / (this->recursion_depth - current_iteration)
2361  );
2362  if(num_partitions_in_current_dim > this->max_num_part_along_dim) {
2363  std::cerr << "ERROR: maxPartNo calculation is wrong."
2364  " num_partitions_in_current_dim: "
2365  << num_partitions_in_current_dim << " this->max_num_part_along_dim: "
2366  << this->max_num_part_along_dim <<
2367  " this->recursion_depth: " << this->recursion_depth <<
2368  " current_iteration:" << current_iteration <<
2369  " future_num_parts_of_part_ii: " << future_num_parts_of_part_ii <<
2370  " might need to fix max part no calculation for "
2371  "largest_prime_first partitioning." <<
2372  std::endl;
2373  std::terminate();
2374  }
2375  // add this number to vector_num_partitioning_in_current_dim vector.
2376  // num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2377  // mj_part_t largest_prime_factor = num_partitions_in_current_dim;
2378 
2379  // Update part num arrays when on current_iteration == 0 and
2380  // using nonuniform first level partitioning
2381  // with requested num parts (num_first_level_parts) and
2382  // a requested distribution in parts (first_level_distribution).
2383  if (current_iteration == 0 &&
2384  this->first_level_distribution.size() != 0 &&
2385  this->num_first_level_parts > 1) {
2386  // Only 1 current part to begin and partitions into
2387  // num_first_level_parts many parts
2388  num_partitioning_in_current_dim.push_back(this->num_first_level_parts);
2389 
2390  // The output number of parts from first level partitioning
2391  output_num_parts = this->num_first_level_parts;
2392 
2393  // Remaining parts left to partition for all future levels
2394  future_num_parts /= this->num_first_level_parts;
2395 
2396  mj_part_t max_part = 0;
2397  mj_part_t sum_first_level_dist = 0;
2398 
2399  // Cumulative sum of distribution of first level parts
2400  // and size of largest first level part
2401  for (int i = 0; i < this->num_first_level_parts; ++i) {
2402  sum_first_level_dist += this->first_level_distribution(i);
2403 
2404  if (this->first_level_distribution(i) > max_part)
2405  max_part = this->first_level_distribution(i);
2406  }
2407 
2408  // Maximum # of remaining parts left to partition for all future levels
2409  future_num_parts = this->num_global_parts * max_part / sum_first_level_dist;
2410 
2411  // Number of parts remaining left to partition for each future_part
2412  // The sum must exactly equal global_num_parts
2413  for (int i = 0; i < this->num_first_level_parts; ++i) {
2414  next_future_num_parts_in_parts->push_back(this->first_level_distribution(i) *
2415  this->num_global_parts / sum_first_level_dist);
2416  }
2417  }
2418  else if (this->divide_to_prime_first) {
2419  // Add this number to num_partitioning_in_current_dim vector.
2420  num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2421 
2422  mj_part_t largest_prime_factor = num_partitions_in_current_dim;
2423 
2424  //increase the output number of parts.
2425  output_num_parts += num_partitions_in_current_dim;
2426 
2427  if (future_num_parts_of_part_ii == atomic_part_count ||
2428  future_num_parts_of_part_ii % atomic_part_count != 0) {
2429  atomic_part_count = 1;
2430  }
2431 
2432  largest_prime_factor =
2433  this->find_largest_prime_factor(future_num_parts_of_part_ii / atomic_part_count);
2434 
2435  // We divide to num_partitions_in_current_dim. But we adjust the weights
2436  // based on the largest prime. If num_partitions_in_current_dim = 2,
2437  // largest prime = 5 --> we divide to 2 parts with weights 3x and 2x.
2438  // if the largest prime is less than part count, we use the part count
2439  // so that we divide uniformly.
2440  if (largest_prime_factor < num_partitions_in_current_dim) {
2441  largest_prime_factor = num_partitions_in_current_dim;
2442  }
2443  //ideal number of future partitions for each part.
2444  mj_part_t ideal_num_future_parts_in_part =
2445  (future_num_parts_of_part_ii / atomic_part_count) / largest_prime_factor;
2446  //if num_partitions_in_current_dim = 2, largest prime = 5 then ideal weight is 2x
2447  mj_part_t ideal_prime_scale = largest_prime_factor / num_partitions_in_current_dim;
2448 
2449 /*
2450  std::cout << "\ncurrent num part: " << ii
2451  << " largest_prime_factor: " << largest_prime_factor
2452  << " To Partition: " << future_num_parts_of_part_ii << "\n\n";
2453 */
2454 
2455  for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2456  //if num_partitions_in_current_dim = 2, largest prime = 5 then ideal weight is 2x
2457  mj_part_t my_ideal_primescale = ideal_prime_scale;
2458  //left over weights. Left side is adjusted to be 3x, right side stays as 2x
2459  if (iii < (largest_prime_factor) % num_partitions_in_current_dim) {
2460  ++my_ideal_primescale;
2461  }
2462  //scale with 'x';
2463  mj_part_t num_future_parts_for_part_iii =
2464  ideal_num_future_parts_in_part * my_ideal_primescale;
2465 
2466  //if there is a remainder in the part increase the part weight.
2467  if (iii < (future_num_parts_of_part_ii / atomic_part_count) % largest_prime_factor) {
2468  //if not uniform, add 1 for the extra parts.
2469  ++num_future_parts_for_part_iii;
2470  }
2471 
2472  next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);
2473 
2474  //if part boxes are stored, initialize the box of the parts as the ancestor.
2475  if (this->mj_keep_part_boxes) {
2476  output_part_boxes->push_back((*input_part_boxes)[ii]);
2477  }
2478 
2479  //set num future_num_parts to maximum in this part.
2480  if (num_future_parts_for_part_iii > future_num_parts)
2481  future_num_parts = num_future_parts_for_part_iii;
2482 
2483  }
2484  }
2485  else {
2486  // Add this number to num_partitioning_in_current_dim vector.
2487  num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2488 
2489  //increase the output number of parts.
2490  output_num_parts += num_partitions_in_current_dim;
2491 
2492  if((future_num_parts_of_part_ii == atomic_part_count) ||
2493  (future_num_parts_of_part_ii % atomic_part_count != 0)) {
2494  atomic_part_count = 1;
2495  }
2496  //ideal number of future partitions for each part.
2497  mj_part_t ideal_num_future_parts_in_part =
2498  (future_num_parts_of_part_ii / atomic_part_count) /
2499  num_partitions_in_current_dim;
2500  for(mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2501  mj_part_t num_future_parts_for_part_iii =
2502  ideal_num_future_parts_in_part;
2503 
2504  //if there is a remainder in the part increase the part weight.
2505  if(iii < (future_num_parts_of_part_ii / atomic_part_count) %
2506  num_partitions_in_current_dim) {
2507  // if not uniform, add 1 for the extra parts.
2508  ++num_future_parts_for_part_iii;
2509  }
2510 
2511  next_future_num_parts_in_parts->push_back(
2512  num_future_parts_for_part_iii * atomic_part_count);
2513 
2514  // if part boxes are stored, initialize the box of the parts as
2515  // the ancestor.
2516  if(this->mj_keep_part_boxes) {
2517  output_part_boxes->push_back((*input_part_boxes)[ii]);
2518  }
2519  //set num future_num_parts to maximum in this part.
2520  if(num_future_parts_for_part_iii > future_num_parts)
2521  future_num_parts = num_future_parts_for_part_iii;
2522  }
2523  }
2524  }
2525  }
2526  // move temp std::vector to host view
2527  device_num_partitioning_in_current_dim = Kokkos::View<
2528  mj_part_t*, device_t>("test", num_partitioning_in_current_dim.size());
2529  host_num_partitioning_in_current_dim =
2530  Kokkos::create_mirror_view(device_num_partitioning_in_current_dim);
2531  for(size_t n = 0; n < num_partitioning_in_current_dim.size(); ++n) {
2532  host_num_partitioning_in_current_dim(n) =
2533  num_partitioning_in_current_dim[n];
2534  }
2535  // setup device equivalent - this data is used on host and device and it's
2536  // more efficient to just setup array on both sides now rather than copy
2537  // values as needed later.
2538  Kokkos::deep_copy(device_num_partitioning_in_current_dim,
2539  host_num_partitioning_in_current_dim);
2540  return output_num_parts;
2541 }
2542 
2543 /* \brief Allocates and initializes the work memory that will be used by MJ.
2544  * */
2545 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2546  typename mj_part_t, typename mj_node_t>
2547 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2548  allocate_set_work_memory()
2549 {
2550  // Throughout the partitioning execution,
2551  // instead of moving the coordinates, hold a permutation array for parts.
2552  // coordinate_permutations holds the current permutation.
2553  this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2554  Kokkos::ViewAllocateWithoutInitializing("coordinate_permutations"),
2555  this->num_local_coords);
2556  auto local_coordinate_permutations = coordinate_permutations;
2557  Kokkos::parallel_for(
2558  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
2559  0, this->num_local_coords), KOKKOS_LAMBDA (mj_lno_t i) {
2560  local_coordinate_permutations(i) = i;
2561  });
2562 
2563  // new_coordinate_permutations holds the permutation computed for the next iteration.
2564  this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2565  Kokkos::ViewAllocateWithoutInitializing("num_local_coords"),
2566  this->num_local_coords);
2567 
2568  this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2569  Kokkos::ViewAllocateWithoutInitializing("assigned parts"), 0);
2570  if(this->num_local_coords > 0) {
2571  this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2572  Kokkos::ViewAllocateWithoutInitializing("assigned part ids"),
2573  this->num_local_coords);
2574  }
2575 
2576  // single partition starts at index-0, and ends at numLocalCoords
2577  // inTotalCounts array holds the end points in coordinate_permutations array
2578  // for each partition. Initially sized 1, and single element is set to
2579  // numLocalCoords.
2580  this->part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2581  Kokkos::ViewAllocateWithoutInitializing("part xadj"), 1);
2582  this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
2583  host_part_xadj(0) = num_local_coords;
2584  Kokkos::deep_copy(this->part_xadj, host_part_xadj);
2585 
2586  // the end points of the output; this is allocated later.
2587  this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2588  Kokkos::ViewAllocateWithoutInitializing("empty"), 0);
2589 
2590  // only store this much if cuts are needed to be stored.
2591  this->all_cut_coordinates = Kokkos::View<mj_scalar_t*, device_t>(
2592  Kokkos::ViewAllocateWithoutInitializing("all cut coordinates"),
2593  this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2594 
2595  // what fraction of the weight each MPI process should put on the left side of each cut line
2596  this->process_cut_line_weight_to_put_left = Kokkos::View<mj_scalar_t*,
2597  device_t>(Kokkos::ViewAllocateWithoutInitializing("empty"), 0);
2598 
2599  // what fraction of the weight each thread in an MPI process should put
2600  // on the left side of each cut line
2601  this->thread_cut_line_weight_to_put_left =
2602  Kokkos::View<mj_scalar_t*, device_t>(
2603  Kokkos::ViewAllocateWithoutInitializing("empty"), 0);
2604 
2605  if(this->distribute_points_on_cut_lines) {
2606  this->process_cut_line_weight_to_put_left =
2607  Kokkos::View<mj_scalar_t *, device_t>(
2608  Kokkos::ViewAllocateWithoutInitializing(
2609  "process_cut_line_weight_to_put_left"),
2610  this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2611  this->thread_cut_line_weight_to_put_left =
2612  Kokkos::View<mj_scalar_t *, device_t>(
2613  Kokkos::ViewAllocateWithoutInitializing(
2614  "thread_cut_line_weight_to_put_left"),
2615  this->max_num_cut_along_dim);
2616  this->process_rectilinear_cut_weight =
2617  Kokkos::View<mj_scalar_t *, device_t>(
2618  Kokkos::ViewAllocateWithoutInitializing("process_rectilinear_cut_weight"),
2619  this->max_num_cut_along_dim);
2620  this->global_rectilinear_cut_weight =
2621  Kokkos::View<mj_scalar_t *, device_t>(
2622  Kokkos::ViewAllocateWithoutInitializing("global_rectilinear_cut_weight"),
2623  this->max_num_cut_along_dim);
2624  }
2625 
2626  // work array to manipulate coordinates of cut lines across iterations.
2627  // necessary because previous cut line information is used for determining
2628  // the next cut line information; therefore, we cannot update the cut work
2629  // array until all cut lines are determined.
2630  this->cut_coordinates_work_array =
2631  Kokkos::View<mj_scalar_t *, device_t>(
2632  Kokkos::ViewAllocateWithoutInitializing("cut_coordinates_work_array"),
2633  this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2634 
2635  // cumulative part weight array.
2636  this->target_part_weights = Kokkos::View<mj_scalar_t*, device_t>(
2637  Kokkos::ViewAllocateWithoutInitializing("target_part_weights"),
2638  this->max_num_part_along_dim * this->max_concurrent_part_calculation);
2639 
2640  // upper bound coordinate of a cut line
2641  this->cut_upper_bound_coordinates =
2642  Kokkos::View<mj_scalar_t*, device_t>(
2643  Kokkos::ViewAllocateWithoutInitializing("cut_upper_bound_coordinates"),
2644  this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2645 
2646  // lower bound coordinate of a cut line
2647  this->cut_lower_bound_coordinates =
2648  Kokkos::View<mj_scalar_t*, device_t>(
2649  Kokkos::ViewAllocateWithoutInitializing("cut_lower_bound_coordinates"),
2650  this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2651 
2652  // lower bound weight of a cut line
2653  this->cut_lower_bound_weights =
2654  Kokkos::View<mj_scalar_t*, device_t>(
2655  Kokkos::ViewAllocateWithoutInitializing("cut_lower_bound_weights"),
2656  this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2657 
2658  //upper bound weight of a cut line
2659  this->cut_upper_bound_weights =
2660  Kokkos::View<mj_scalar_t*, device_t>(
2661  Kokkos::ViewAllocateWithoutInitializing("cut_upper_bound_weights"),
2662  this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2663 
2664  // combined array to exchange the min and max coordinate,
2665  // and total weight of part.
2666  this->process_local_min_max_coord_total_weight =
2667  Kokkos::View<mj_scalar_t*, device_t>(
2668  Kokkos::ViewAllocateWithoutInitializing(
2669  "process_local_min_max_coord_total_weight"),
2670  3 * this->max_concurrent_part_calculation);
2671 
2672  // global combined array with the results for min, max and total weight.
2673  this->global_min_max_coord_total_weight =
2674  Kokkos::View<mj_scalar_t*, device_t>(
2675  Kokkos::ViewAllocateWithoutInitializing("global_min_max_coord_total_weight"),
2676  3 * this->max_concurrent_part_calculation);
2677 
2678  // is_cut_line_determined is used to determine if a cutline is
2679  // determined already. If a cut line is already determined, the next
2680  // iterations will skip this cut line.
2681  this->is_cut_line_determined = Kokkos::View<bool *, device_t>(
2682  Kokkos::ViewAllocateWithoutInitializing("is_cut_line_determined"),
2683  this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2684 
2685  // incomplete_cut_count holds the number of cut lines that have not
2686  // been finalized for each part when concurrentPartCount>1, using this
2687  // information, if incomplete_cut_count[x]==0, then no work is done for
2688  // this part.
2689  this->device_incomplete_cut_count = Kokkos::View<mj_part_t *, device_t>(
2690  Kokkos::ViewAllocateWithoutInitializing("device_incomplete_cut_count"),
2691  this->max_concurrent_part_calculation);
2692  this->incomplete_cut_count =
2693  Kokkos::create_mirror_view(device_incomplete_cut_count);
2694 
2695  // local part weights of each thread.
2696  this->thread_part_weights = Kokkos::View<double *, device_t>(
2697  Kokkos::ViewAllocateWithoutInitializing("thread_part_weights"),
2698  this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
2699 
2700  this->thread_cut_left_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2701  Kokkos::ViewAllocateWithoutInitializing("thread_cut_left_closest_point"),
2702  this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2703 
2704  // thread_cut_right_closest_point to hold the closest coordinate to a
2705  // cutline from right (for each thread)
2706  this->thread_cut_right_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2707  Kokkos::ViewAllocateWithoutInitializing("thread_cut_right_closest_point"),
2708  this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2709 
2710  // to store how many points in each part a thread has.
2711  this->thread_point_counts = Kokkos::View<mj_lno_t *, device_t>(
2712  Kokkos::ViewAllocateWithoutInitializing("thread_point_counts"),
2713  this->max_num_part_along_dim);
2714 
2715  // for faster communication, concatenation of
2716  // totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
2717  // leftClosest distances sized P-1, since P-1 cut lines
2718  // rightClosest distances sized P-1, since P-1 cut lines.
2719  this->total_part_weight_left_right_closests =
2720  Kokkos::View<mj_scalar_t*, device_t>(
2721  Kokkos::ViewAllocateWithoutInitializing(
2722  "total_part_weight_left_right_closests"),
2723  (this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) *
2724  this->max_concurrent_part_calculation);
2725 
2726  this->global_total_part_weight_left_right_closests =
2727  Kokkos::View<mj_scalar_t*, device_t>(
2728  Kokkos::ViewAllocateWithoutInitializing(
2729  "global_total_part_weight_left_right_closests"),
2730  (this->max_num_total_part_along_dim +
2731  this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2732 
2733  this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
2734  Kokkos::ViewAllocateWithoutInitializing("gids"), num_local_coords);
2735 
2736  this->owner_of_coordinate = Kokkos::View<int *, Kokkos::HostSpace>(
2737  Kokkos::ViewAllocateWithoutInitializing("owner_of_coordinate"),
2738  num_local_coords);
2739 
2740  // owners are kept on host - so we don't touch them on device.
2741  // This improves the migration code but means we must initialize serially here.
2742  // Note we might allow this to be OpenMP when available even for CUDA.
2743  Kokkos::deep_copy(owner_of_coordinate, myActualRank);
2744 
2745  auto local_current_mj_gnos = current_mj_gnos;
2746  auto local_initial_mj_gnos = initial_mj_gnos;
2747  Kokkos::parallel_for(
2748  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2749  (0, num_local_coords), KOKKOS_LAMBDA (mj_lno_t j) {
2750  local_current_mj_gnos(j) = local_initial_mj_gnos(j);
2751  });
2752 }
2753 
2754 /* \brief compute the global bounding box
2755  */
2756 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2757  typename mj_part_t, typename mj_node_t>
2758 void AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,
2759  mj_node_t>::compute_global_box()
2760 {
2761  //local min coords
2762  mj_scalar_t *mins = new mj_scalar_t[this->coord_dim];
2763  //global min coords
2764  mj_scalar_t *gmins = new mj_scalar_t[this->coord_dim];
2765  //local max coords
2766  mj_scalar_t *maxs = new mj_scalar_t[this->coord_dim];
2767  //global max coords
2768  mj_scalar_t *gmaxs = new mj_scalar_t[this->coord_dim];
2769 
2770  auto local_mj_coordinates = this->mj_coordinates;
2771 
2772  // If we are only doing 2 parts then we don't need these values
2773  // for y and z. Init them all to 0 first
2774  for(int i = 0; i < this->coord_dim; ++i) {
2775  mins[i] = 0;
2776  maxs[i] = 0;
2777  }
2778 
2779  for(int i = 0; i < std::min(this->recursion_depth, this->coord_dim); ++i) {
2780  Kokkos::parallel_reduce("MinReduce",
2781  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2782  (0, this->num_local_coords),
2783  KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_min) {
2784  if(local_mj_coordinates(j,i) < running_min) {
2785  running_min = local_mj_coordinates(j,i);
2786  }
2787  }, Kokkos::Min<mj_scalar_t>(mins[i]));
2788  Kokkos::parallel_reduce("MaxReduce",
2789  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2790  (0, this->num_local_coords),
2791  KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_max) {
2792  if(local_mj_coordinates(j,i) > running_max) {
2793  running_max = local_mj_coordinates(j,i);
2794  }
2795  }, Kokkos::Max<mj_scalar_t>(maxs[i]));
2796  }
2797 
2798  reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
2799  this->coord_dim, mins, gmins
2800  );
2801 
2802  reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
2803  this->coord_dim, maxs, gmaxs
2804  );
2805 
2806  //create single box with all areas.
2807  global_box = rcp(new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
2808  //coordinateModelPartBox <mj_scalar_t, mj_part_t> tmpBox (0, coordDim);
2809  delete [] mins;
2810  delete [] gmins;
2811  delete [] maxs;
2812  delete [] gmaxs;
2813 }
2814 
2815 /* \brief for part communication we keep track of the box boundaries.
2816  * This is performed when either asked specifically, or when geometric mapping
2817  * is performed afterwards.
2818  * This function initializes a single box with all global min, max coordinates.
2819  * \param initial_partitioning_boxes the input and output vector for boxes.
2820  */
2821 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2822  typename mj_part_t, typename mj_node_t>
2823 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2824  mj_node_t>::init_part_boxes(
2825  RCP<mj_partBoxVector_t> & initial_partitioning_boxes)
2826 {
2827  mj_partBox_t tmp_box(*global_box);
2828  initial_partitioning_boxes->push_back(tmp_box);
2829 }
2830 
2835 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2836  typename mj_part_t,
2837  typename mj_node_t>
2838 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2839  mj_get_local_min_max_coord_totW(
2840  mj_part_t current_work_part,
2841  mj_part_t current_concurrent_num_parts,
2842  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords)
2843 {
2844  auto local_coordinate_permutations = this->coordinate_permutations;
2845  auto local_process_local_min_max_coord_total_weight =
2846  this->process_local_min_max_coord_total_weight;
2847  auto local_mj_weights = this->mj_weights;
2848 
2849  bool bUniformWeights = mj_uniform_weights(0);
2850 
2851  for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
2852 
2853  mj_part_t concurrent_current_part = current_work_part + kk;
2854  mj_lno_t coordinate_begin_index = concurrent_current_part == 0 ? 0 :
2855  host_part_xadj(concurrent_current_part - 1);
2856  mj_lno_t coordinate_end_index =
2857  host_part_xadj(concurrent_current_part);
2858 
2859  mj_scalar_t my_min_coord = 0;
2860  mj_scalar_t my_max_coord = 0;
2861  mj_scalar_t my_total_weight;
2862  //if the part is empty,
2863  //set the min and max coordinates reversed (min > max).
2864  if(coordinate_begin_index >= coordinate_end_index)
2865  {
2866  my_min_coord = std::numeric_limits<mj_scalar_t>::max();
2867  my_max_coord = -std::numeric_limits<mj_scalar_t>::max();
2868  my_total_weight = 0;
2869  }
2870  else {
2871  // get min
2872  Kokkos::parallel_reduce("get min",
2873  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2874  (coordinate_begin_index, coordinate_end_index),
2875  KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_min) {
2876  int i = local_coordinate_permutations(j);
2877  if(mj_current_dim_coords(i) < running_min)
2878  running_min = mj_current_dim_coords(i);
2879  }, Kokkos::Min<mj_scalar_t>(my_min_coord));
2880  // get max
2881  Kokkos::parallel_reduce("get max",
2882  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2883  (coordinate_begin_index, coordinate_end_index),
2884  KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_max) {
2885  int i = local_coordinate_permutations(j);
2886  if(mj_current_dim_coords(i) > running_max)
2887  running_max = mj_current_dim_coords(i);
2888  }, Kokkos::Max<mj_scalar_t>(my_max_coord));
2889  if(bUniformWeights) {
2890  my_total_weight = coordinate_end_index - coordinate_begin_index;
2891  }
2892  else {
2893  my_total_weight = 0;
2894  Kokkos::parallel_reduce("get weight",
2895  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2896  (coordinate_begin_index, coordinate_end_index),
2897  KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & lsum) {
2898  int i = local_coordinate_permutations(j);
2899  lsum += local_mj_weights(i,0);
2900  }, my_total_weight);
2901  }
2902  }
2903 
2904  // single write
2905  Kokkos::parallel_for(
2906  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
2907  (0, 1), KOKKOS_LAMBDA (int dummy) {
2908  local_process_local_min_max_coord_total_weight(kk) =
2909  my_min_coord;
2910  local_process_local_min_max_coord_total_weight(
2911  kk + current_concurrent_num_parts) = my_max_coord;
2912  local_process_local_min_max_coord_total_weight(
2913  kk + 2*current_concurrent_num_parts) = my_total_weight;
2914  });
2915  }
2916 }
2917 
2930 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2931  typename mj_part_t, typename mj_node_t>
2932 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2933  mj_node_t>::mj_get_global_min_max_coord_totW(
2934  mj_part_t current_concurrent_num_parts,
2935  Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
2936  Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total) {
2937  // reduce min for first current_concurrent_num_parts elements, reduce
2938  // max for next concurrentPartCount elements, reduce sum for the last
2939  // concurrentPartCount elements.
2940  if(this->comm->getSize() > 1) {
2941  // We're using explicit host here as Spectrum MPI would fail
2942  // with the prior HostMirror UVMSpace to UVMSpace setup.
2943  auto host_local_min_max_total =
2944  Kokkos::create_mirror_view(Kokkos::HostSpace(), local_min_max_total);
2945  auto host_global_min_max_total =
2946  Kokkos::create_mirror_view(Kokkos::HostSpace(), global_min_max_total);
2947  Kokkos::deep_copy(host_local_min_max_total, local_min_max_total);
2948  Teuchos::MultiJaggedCombinedMinMaxTotalReductionOp<int, mj_scalar_t>
2949  reductionOp(current_concurrent_num_parts,
2950  current_concurrent_num_parts, current_concurrent_num_parts);
2951  try {
2952  reduceAll<int, mj_scalar_t>(
2953  *(this->comm),
2954  reductionOp,
2955  3 * current_concurrent_num_parts,
2956  host_local_min_max_total.data(),
2957  host_global_min_max_total.data());
2958  }
2959  Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
2960  Kokkos::deep_copy(global_min_max_total, host_global_min_max_total);
2961  }
2962  else {
2963  mj_part_t s = 3 * current_concurrent_num_parts;
2964  Kokkos::parallel_for(
2965  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
2966  (0, s), KOKKOS_LAMBDA (mj_part_t i) {
2967  global_min_max_total(i) = local_min_max_total(i);
2968  });
2969  }
2970 }
2971 
3004 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3005  typename mj_part_t, typename mj_node_t>
3006 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
3007  mj_get_initial_cut_coords_target_weights(
3008  mj_scalar_t min_coord,
3009  mj_scalar_t max_coord,
3010  mj_part_t num_cuts/*p-1*/ ,
3011  mj_scalar_t global_weight,
3012  /*p - 1 sized, coordinate of each cut line*/
3013  Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
3014  /*cumulative weights, at left side of each cut line. p-1 sized*/
3015  Kokkos::View<mj_scalar_t *, device_t> & current_target_part_weights ,
3016  std::vector <mj_part_t> *future_num_part_in_parts, //the vector
3017  std::vector <mj_part_t> *next_future_num_parts_in_parts,
3018  mj_part_t concurrent_current_part,
3019  mj_part_t obtained_part_index,
3020  mj_part_t num_target_first_level_parts,
3021  const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist)
3022 {
3023  mj_scalar_t coord_range = max_coord - min_coord;
3024 
3025  // We decided we could keep some std::vectors around for now. Eventually
3026  // it would be nice to have everything just as views with some being device
3027  // and some host. This particular case needs a bit of work to get setup
3028  // in a cleaner way so not going to mess with it at the moment.
3029 
3030  bool bUniformPartsCheck =
3031  num_target_first_level_parts <= 1 && this->mj_uniform_parts(0);
3032 
3033  if(!bUniformPartsCheck) {
3034  bool bValidNonUniformTargetWeights =
3035  (num_target_first_level_parts > 1 && target_first_level_dist.size() != 0);
3036  if(!bValidNonUniformTargetWeights) {
3037  std::cerr << "MJ does not support non uniform part weights beyond the first partition" << std::endl;
3038  std::terminate();
3039  }
3040  }
3041 
3042  Kokkos::View<mj_scalar_t*, device_t> device_cumulative(
3043  "device_cumulative", num_cuts);
3044  auto host_cumulative = Kokkos::create_mirror_view(device_cumulative);
3045 
3046  mj_scalar_t cumulative = 0;
3047 
3048  if(bUniformPartsCheck) {
3049  // How many total future parts the part will be partitioned into.
3050  mj_scalar_t total_future_part_count_in_part =
3051  static_cast<mj_scalar_t>((*future_num_part_in_parts)[concurrent_current_part]);
3052 
3053  // How much each part should weigh in ideal case.
3054  mj_scalar_t unit_part_weight =
3055  global_weight / total_future_part_count_in_part;
3056 
3057  for(mj_part_t i = 0; i < num_cuts; ++i) {
3058  cumulative += unit_part_weight * static_cast<mj_scalar_t>((*next_future_num_parts_in_parts)[i + obtained_part_index]);
3059  host_cumulative(i) = cumulative;
3060  }
3061  }
3062  else {
3063  // Sum of entries in the first level partition distribution vector
3064  mj_scalar_t sum_target_first_level_dist = 0.0;
3065  for (int i = 0; i < num_target_first_level_parts; ++i) {
3066  sum_target_first_level_dist += target_first_level_dist(i);
3067  }
3068 
3069  for(mj_part_t i = 0; i < num_cuts; ++i) {
3070  cumulative += global_weight * target_first_level_dist(i) /
3071  sum_target_first_level_dist;
3072  host_cumulative(i) = cumulative;
3073  }
3074  }
3075 
3076  Kokkos::deep_copy(device_cumulative, host_cumulative);
3077 
3078  Kokkos::parallel_for("Write num in parts",
3079  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3080  (0, num_cuts), KOKKOS_LAMBDA(mj_part_t cut) {
3081  // set target part weight.
3082  current_target_part_weights(cut) = device_cumulative(cut);
3083  initial_cut_coords(cut) = min_coord +
3084  (coord_range * device_cumulative(cut)) / global_weight;
3085  // written redundantly by each iteration, but kept here so it is set on device
3086  current_target_part_weights(num_cuts) = global_weight;
3087  });
3088 
3089  // round the target part weights.
3090  // Note need to discuss regarding DragonFly commits and determine if we
3091  // would not simply check mj_uniform_weights here.
3092  if (!bUniformPartsCheck || this->mj_uniform_weights[0]) {
3093  Kokkos::parallel_for(
3094  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3095  (0, num_cuts + 1),
3096  KOKKOS_LAMBDA (mj_part_t i) {
3097  current_target_part_weights(i) =
3098  long(current_target_part_weights(i) + 0.5);
3099  });
3100  }
3101 }
3102 
3119 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3120  typename mj_part_t, typename mj_node_t>
3121 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
3122  set_initial_coordinate_parts(
3123  mj_scalar_t &max_coordinate,
3124  mj_scalar_t &min_coordinate,
3125  mj_lno_t coordinate_begin_index,
3126  mj_lno_t coordinate_end_index,
3127  Kokkos::View<mj_lno_t *, device_t> & mj_current_coordinate_permutations,
3128  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3129  Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
3130  mj_part_t &partition_count)
3131 {
3132  mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
3133 
3134  // if there is single point, or if all points are along a line.
3135  // set initial part to 0 for all.
3136  if(std::abs(coordinate_range) < this->sEpsilon ) {
3137  Kokkos::parallel_for(
3138  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3139  (coordinate_begin_index, coordinate_end_index),
3140  KOKKOS_LAMBDA (mj_lno_t ii) {
3141  mj_part_ids(mj_current_coordinate_permutations[ii]) = 0;
3142  });
3143  }
3144  else {
3145  // otherwise estimate an initial part for each coordinate.
3146  // assuming uniform distribution of points.
3147  mj_scalar_t slice = coordinate_range / partition_count;
3148  Kokkos::parallel_for(
3149  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3150  (coordinate_begin_index, coordinate_end_index),
3151  KOKKOS_LAMBDA (mj_lno_t ii) {
3152  mj_lno_t iii = mj_current_coordinate_permutations[ii];
3153  mj_part_t pp =
3154  mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
3155  if(pp >= partition_count) {
3156  pp = partition_count - 1; // don't want last coord in an invalid part
3157  }
3158  mj_part_ids[iii] = 2 * pp;
3159  });
3160  }
3161 }
3162 
3177 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3178  typename mj_part_t, typename mj_node_t>
3179 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,mj_node_t>::mj_1D_part(
3180  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3181  double used_imbalance_tolerance,
3182  mj_part_t current_work_part,
3183  mj_part_t current_concurrent_num_parts,
3184  Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
3185  mj_part_t total_incomplete_cut_count,
3186  Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
3187  Kokkos::View<size_t*, device_t> & view_total_reduction_size)
3188 {
3189  this->temp_cut_coords = current_cut_coordinates;
3190 
3191  Teuchos::MultiJaggedCombinedReductionOp<mj_part_t, mj_scalar_t>
3192  *reductionOp = NULL;
3193 
3194  bool bSingleProcess = (this->comm->getSize() == 1);
3195 
3196  std::vector<mj_part_t> temp(host_num_partitioning_in_current_dim.size());
3197  if(!bSingleProcess) {
3198  for(size_t n = 0; n < host_num_partitioning_in_current_dim.size(); ++n) {
3199  temp[n] = host_num_partitioning_in_current_dim(n);
3200  }
3201  reductionOp = new Teuchos::MultiJaggedCombinedReductionOp
3202  <mj_part_t, mj_scalar_t>(
3203  &temp,
3204  current_work_part,
3205  current_concurrent_num_parts);
3206  }
3207 
3208  auto local_cut_lower_bound_coordinates =
3209  cut_lower_bound_coordinates;
3210  auto local_cut_upper_bound_coordinates =
3211  cut_upper_bound_coordinates;
3212  auto local_cut_upper_bound_weights = cut_upper_bound_weights;
3213  auto local_cut_lower_bound_weights = cut_lower_bound_weights;
3214  bool local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
3215  auto local_process_cut_line_weight_to_put_left =
3216  process_cut_line_weight_to_put_left;
3217  auto local_temp_cut_coords = temp_cut_coords;
3218  auto local_global_total_part_weight_left_right_closests =
3219  global_total_part_weight_left_right_closests;
3220  auto local_cut_coordinates_work_array =
3221  cut_coordinates_work_array;
3222  auto local_part_xadj = part_xadj;
3223  auto local_global_min_max_coord_total_weight =
3224  global_min_max_coord_total_weight;
3225  auto local_target_part_weights =
3226  target_part_weights;
3227  auto local_global_rectilinear_cut_weight =
3228  global_rectilinear_cut_weight;
3229  auto local_process_rectilinear_cut_weight =
3230  process_rectilinear_cut_weight;
3231 
3232  auto local_is_cut_line_determined = this->is_cut_line_determined;
3233  auto local_device_num_partitioning_in_current_dim =
3234  device_num_partitioning_in_current_dim;
3235 
3236  Kokkos::parallel_for(
3237  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3238  KOKKOS_LAMBDA (int dummy) {
3239 
3240  // these need to be initialized
3241  view_rectilinear_cut_count(0) = 0;
3242  view_total_reduction_size(0) = 0;
3243 
3244  // initialize the lower and upper bounds of the cuts.
3245  mj_part_t next = 0;
3246  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3247  mj_part_t num_part_in_dim =
3248  local_device_num_partitioning_in_current_dim(current_work_part + i);
3249  mj_part_t num_cut_in_dim = num_part_in_dim - 1;
3250  view_total_reduction_size(0) += (4 * num_cut_in_dim + 1);
3251 
3252  for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii) {
3253  local_is_cut_line_determined(next) = false;
3254  // min coordinate
3255  local_cut_lower_bound_coordinates(next) =
3256  local_global_min_max_coord_total_weight(i);
3257  // max coordinate
3258  local_cut_upper_bound_coordinates(next) =
3259  local_global_min_max_coord_total_weight(
3260  i + current_concurrent_num_parts);
3261  // total weight
3262  local_cut_upper_bound_weights(next) =
3263  local_global_min_max_coord_total_weight(
3264  i + 2 * current_concurrent_num_parts);
3265  local_cut_lower_bound_weights(next) = 0;
3266  if(local_distribute_points_on_cut_lines) {
3267  local_process_cut_line_weight_to_put_left(next) = 0;
3268  }
3269  ++next;
3270  }
3271  }
3272  });
3273 
3274  // loop_count allows the kernel to behave differently on the first loop
3275  // and subsequent loops. First loop we do a binary search and subsequent
3276  // loops we simply step towards our target.
3277  int loop_count = 0;
3278  while (total_incomplete_cut_count != 0) {
3279  this->mj_1D_part_get_part_weights(
3280  current_concurrent_num_parts,
3281  current_work_part,
3282  mj_current_dim_coords,
3283  loop_count);
3284  ++loop_count;
3285 
3286  this->mj_combine_rightleft_and_weights(
3287  current_work_part,
3288  current_concurrent_num_parts);
3289 
3290  // now sum up the results of mpi processors.
3291  if(!bSingleProcess) {
3292  // We're using explicit host here as Spectrum MPI would fail
3293  // with the prior HostMirror UVMSpace to UVMSpace setup.
3294  auto host_total_part_weight_left_right_closests =
3295  Kokkos::create_mirror_view(Kokkos::HostSpace(),
3296  total_part_weight_left_right_closests);
3297  auto host_global_total_part_weight_left_right_closests =
3298  Kokkos::create_mirror_view(Kokkos::HostSpace(),
3299  global_total_part_weight_left_right_closests);
3300 
3301  Kokkos::deep_copy(host_total_part_weight_left_right_closests,
3302  total_part_weight_left_right_closests);
3303 
3304  size_t host_view_total_reduction_size;
3305  Kokkos::parallel_reduce("Read single",
3306  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3307  KOKKOS_LAMBDA(int dummy, size_t & set_single) {
3308  set_single = view_total_reduction_size(0);
3309  }, host_view_total_reduction_size);
3310 
3311  reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
3312  host_view_total_reduction_size,
3313  host_total_part_weight_left_right_closests.data(),
3314  host_global_total_part_weight_left_right_closests.data());
3315  Kokkos::deep_copy(global_total_part_weight_left_right_closests,
3316  host_global_total_part_weight_left_right_closests);
3317  }
3318  else {
3319  local_global_total_part_weight_left_right_closests =
3320  this->total_part_weight_left_right_closests;
3321  }
3322 
3323  // how much cut will be shifted for the next part in the concurrent
3324  // part calculation.
3325  mj_part_t cut_shift = 0;
3326 
3327  // how much the concatenated array will be shifted for the next part
3328  // in concurrent part calculation.
3329  size_t tlr_shift = 0;
3330 
3331  Kokkos::View<mj_part_t*, Kokkos::HostSpace>
3332  save_initial_incomplete_cut_count("save_initial_incomplete_cut_count",
3333  current_concurrent_num_parts);
3334 
3335  for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3336 
3337  mj_part_t num_parts =
3338  host_num_partitioning_in_current_dim(current_work_part + kk);
3339 
3340  mj_part_t num_cuts = num_parts - 1;
3341  size_t num_total_part = num_parts + size_t (num_cuts);
3342 
3343  //if the cuts of this part have already been completed,
3344  //nothing to do for this part.
3345  //just update the shift amount and proceed.
3346  mj_part_t kk_incomplete_cut_count = this->incomplete_cut_count(kk);
3347 
3348  if(kk_incomplete_cut_count == 0) {
3349  cut_shift += num_cuts;
3350  tlr_shift += (num_total_part + 2 * num_cuts);
3351  continue;
3352  }
3353 
3354  Kokkos::View<mj_scalar_t *, device_t> current_local_part_weights =
3355  Kokkos::subview(this->total_part_weight_left_right_closests,
3356  std::pair<mj_lno_t, mj_lno_t>(
3357  tlr_shift,
3358  this->total_part_weight_left_right_closests.size()));
3359 
3360  Kokkos::View<mj_scalar_t *, device_t> current_global_tlr =
3361  Kokkos::subview(
3362  local_global_total_part_weight_left_right_closests,
3363  std::pair<mj_lno_t, mj_lno_t>(
3364  tlr_shift,
3365  local_global_total_part_weight_left_right_closests.size()));
3366  Kokkos::View<mj_scalar_t *, device_t>
3367  current_global_left_closest_points =
3368  Kokkos::subview(current_global_tlr,
3369  std::pair<mj_lno_t, mj_lno_t>(
3370  num_total_part,
3371  current_global_tlr.size()));
3372  Kokkos::View<mj_scalar_t *, device_t>
3373  current_global_right_closest_points =
3374  Kokkos::subview(current_global_tlr,
3375  std::pair<mj_lno_t, mj_lno_t>(
3376  num_total_part + num_cuts,
3377  current_global_tlr.size()));
3378  Kokkos::View<mj_scalar_t *, device_t> current_global_part_weights =
3379  current_global_tlr;
3380 
3381  Kokkos::View<bool *, device_t> current_cut_line_determined =
3382  Kokkos::subview(this->is_cut_line_determined,
3383  std::pair<mj_lno_t, mj_lno_t>(
3384  cut_shift,
3385  this->is_cut_line_determined.size()));
3386  Kokkos::View<mj_scalar_t *, device_t> current_part_target_weights =
3387  Kokkos::subview(local_target_part_weights,
3388  std::pair<mj_lno_t, mj_lno_t>(
3389  cut_shift + kk,
3390  local_target_part_weights.size()));
3391  Kokkos::View<mj_scalar_t *, device_t>
3392  current_part_cut_line_weight_to_put_left =
3393  Kokkos::subview(local_process_cut_line_weight_to_put_left,
3394  std::pair<mj_lno_t, mj_lno_t>(
3395  cut_shift,
3396  local_process_cut_line_weight_to_put_left.size()));
3397 
3398  save_initial_incomplete_cut_count(kk) =
3399  kk_incomplete_cut_count;
3400 
3401  Kokkos::View<mj_scalar_t *, device_t>
3402  current_cut_lower_bound_weights =
3403  Kokkos::subview(local_cut_lower_bound_weights,
3404  std::pair<mj_lno_t, mj_lno_t>(
3405  cut_shift,
3406  local_cut_lower_bound_weights.size()));
3407  Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_weights =
3408  Kokkos::subview(local_cut_upper_bound_weights,
3409  std::pair<mj_lno_t, mj_lno_t>(
3410  cut_shift,
3411  local_cut_upper_bound_weights.size()));
3412  Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_bounds =
3413  Kokkos::subview(local_cut_upper_bound_coordinates,
3414  std::pair<mj_lno_t, mj_lno_t>(
3415  cut_shift,
3416  local_cut_upper_bound_coordinates.size()));
3417  Kokkos::View<mj_scalar_t *, device_t> current_cut_lower_bounds =
3418  Kokkos::subview(local_cut_lower_bound_coordinates,
3419  std::pair<mj_lno_t, mj_lno_t>(
3420  cut_shift,
3421  local_cut_lower_bound_coordinates.size()));
3422 
3423  // Now compute the new cut coordinates.
3424  Kokkos::View<mj_scalar_t*, device_t> sub_temp_cut_coords =
3425  Kokkos::subview(this->temp_cut_coords,
3426  std::pair<mj_lno_t, mj_lno_t>(
3427  cut_shift, this->temp_cut_coords.size()));
3428  Kokkos::View<mj_scalar_t*, device_t> sub_cut_coordinates_work_array =
3429  Kokkos::subview(this->cut_coordinates_work_array,
3430  std::pair<mj_lno_t, mj_lno_t>(
3431  cut_shift, this->cut_coordinates_work_array.size()));
3432 
3433  this->mj_get_new_cut_coordinates(
3434  current_concurrent_num_parts,
3435  kk,
3436  num_cuts,
3437  used_imbalance_tolerance,
3438  current_global_part_weights,
3439  current_local_part_weights,
3440  current_part_target_weights,
3441  current_cut_line_determined,
3442  sub_temp_cut_coords,
3443  current_cut_upper_bounds,
3444  current_cut_lower_bounds,
3445  current_global_left_closest_points,
3446  current_global_right_closest_points,
3447  current_cut_lower_bound_weights,
3448  current_cut_upper_weights,
3449  sub_cut_coordinates_work_array,
3450  current_part_cut_line_weight_to_put_left,
3451  view_rectilinear_cut_count);
3452 
3453  cut_shift += num_cuts;
3454  tlr_shift += (num_total_part + 2 * num_cuts);
3455  } // end of kk loop
3456 
3457  for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3458  mj_part_t iteration_complete_cut_count =
3459  save_initial_incomplete_cut_count(kk) - this->incomplete_cut_count(kk);
3460  total_incomplete_cut_count -= iteration_complete_cut_count;
3461  }
3462 
3463  Kokkos::parallel_for(
3464  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3465  (0, local_temp_cut_coords.size()), KOKKOS_LAMBDA(int n) {
3466  auto t = local_temp_cut_coords(n);
3467  local_temp_cut_coords(n) = local_cut_coordinates_work_array(n);
3468  local_cut_coordinates_work_array(n) = t;
3469  });
3470  } // end of the while loop
3471 
3472  // Needed only if keep_cuts; otherwise we could simply swap the array
3473  // pointers cutCoordinates and cutCoordinatesWork.
3474  // (At the first iteration, cutCoordinates == cutCoordinates_tmp.)
3475  // The computed cuts must end up in cutCoordinates.
3476  if(current_cut_coordinates != local_temp_cut_coords) {
3477  Kokkos::parallel_for(
3478  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3479  (0, 1), KOKKOS_LAMBDA(int dummy) {
3480  mj_part_t next = 0;
3481  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3482  mj_part_t num_parts = -1;
3483  num_parts = local_device_num_partitioning_in_current_dim(
3484  current_work_part + i);
3485  mj_part_t num_cuts = num_parts - 1;
3486  for(mj_part_t ii = 0; ii < num_cuts; ++ii) {
3487  current_cut_coordinates(next + ii) = local_temp_cut_coords(next + ii);
3488  }
3489  next += num_cuts;
3490  }
3491  for(int n = 0; n <
3492  static_cast<int>(local_cut_coordinates_work_array.size()); ++n) {
3493  local_cut_coordinates_work_array(n) = local_temp_cut_coords(n);
3494  }
3495  });
3496  }
3497 
3498  delete reductionOp;
3499 }
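// ----------------------------------------------------------------------------
// Editor's illustrative sketch (not part of the original source). The routine
// above carves every per-part working segment (cut bounds, weights, closest
// points) out of one backing view via Kokkos::subview with a std::pair index
// range. The hypothetical helper below isolates that aliasing pattern: the
// returned slice shares storage with the backing view, so writes through the
// slice land directly in the full array, with no allocation or copying.
template <typename view_t>
inline auto mj_sketch_slice(view_t backing, size_t begin, size_t end)
  -> decltype(Kokkos::subview(backing, std::pair<size_t, size_t>(begin, end)))
{
  // e.g. mj_sketch_slice(weights, cut_shift, weights.size()) mirrors the
  // current_cut_* subviews constructed above.
  return Kokkos::subview(backing, std::pair<size_t, size_t>(begin, end));
}
// ----------------------------------------------------------------------------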
3500 
3501 template<class scalar_t>
3502 struct Zoltan2_MJArrayType {
3503  scalar_t * ptr;
3504 
3505  // With the new kokkos setup, parallel_reduce will call the empty
3506  // constructor and we update the ptr in the init method.
3507  KOKKOS_INLINE_FUNCTION
3508  Zoltan2_MJArrayType() : ptr(NULL) {};
3509 
3510  KOKKOS_INLINE_FUNCTION
3511  Zoltan2_MJArrayType(scalar_t * pSetPtr) : ptr(pSetPtr) {};
3512 };
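// ----------------------------------------------------------------------------
// Editor's illustrative sketch (not part of the original source). Because
// Zoltan2_MJArrayType holds only a raw pointer, copies are shallow; a reducer
// that uses it as its value_type must re-point every fresh copy at real
// scratch storage (as the init() methods below do) before zeroing it. This
// hypothetical helper walks that contract serially on a caller-owned buffer.
template <typename scalar_t>
inline scalar_t mj_sketch_shallow_value_contract(scalar_t * scratch, int n)
{
  Zoltan2_MJArrayType<scalar_t> a;          // default ctor: a.ptr == NULL
  a.ptr = scratch;                          // what init() does: adopt storage
  for(int i = 0; i < n; ++i) a.ptr[i] = 0;  // and zero the slots
  Zoltan2_MJArrayType<scalar_t> b(a.ptr);   // shallow alias, as join() receives
  b.ptr[0] += 1;                            // accumulates through the alias
  return a.ptr[0];                          // == 1: a and b share storage
}
// ----------------------------------------------------------------------------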
3513 
3514 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3515 
3516 template<class policy_t, class scalar_t, class part_t>
3517 struct ArrayCombinationReducer {
3518 
3519  typedef ArrayCombinationReducer reducer;
3520  typedef Zoltan2_MJArrayType<scalar_t> value_type;
3521  scalar_t max_scalar;
3522  value_type * value;
3523  int value_count_rightleft;
3524  int value_count_weights;
3525 
3526  KOKKOS_INLINE_FUNCTION ArrayCombinationReducer(
3527  scalar_t mj_max_scalar,
3528  value_type &val,
3529  int mj_value_count_rightleft,
3530  int mj_value_count_weights) :
3531  max_scalar(mj_max_scalar),
3532  value(&val),
3533  value_count_rightleft(mj_value_count_rightleft),
3534  value_count_weights(mj_value_count_weights)
3535  {}
3536 
3537  KOKKOS_INLINE_FUNCTION
3538  value_type& reference() const {
3539  return *value;
3540  }
3541 
3542  KOKKOS_INLINE_FUNCTION
3543  void join(value_type& dst, const value_type& src) const {
3544  for(int n = 0; n < value_count_weights; ++n) {
3545  dst.ptr[n] += src.ptr[n];
3546  }
3547 
3548  for(int n = value_count_weights + 2;
3549  n < value_count_weights + value_count_rightleft - 2; n += 2) {
3550  if(src.ptr[n] > dst.ptr[n]) {
3551  dst.ptr[n] = src.ptr[n];
3552  }
3553  if(src.ptr[n+1] < dst.ptr[n+1]) {
3554  dst.ptr[n+1] = src.ptr[n+1];
3555  }
3556  }
3557  }
3558 
3559  KOKKOS_INLINE_FUNCTION
3560  void join (volatile value_type& dst, const volatile value_type& src) const {
3561  for(int n = 0; n < value_count_weights; ++n) {
3562  dst.ptr[n] += src.ptr[n];
3563  }
3564 
3565  for(int n = value_count_weights + 2;
3566  n < value_count_weights + value_count_rightleft - 2; n += 2) {
3567  if(src.ptr[n] > dst.ptr[n]) {
3568  dst.ptr[n] = src.ptr[n];
3569  }
3570  if(src.ptr[n+1] < dst.ptr[n+1]) {
3571  dst.ptr[n+1] = src.ptr[n+1];
3572  }
3573  }
3574  }
3575 
3576  KOKKOS_INLINE_FUNCTION void init (value_type& dst) const {
3577  dst.ptr = value->ptr; // must update ptr
3578 
3579  for(int n = 0; n < value_count_weights; ++n) {
3580  dst.ptr[n] = 0;
3581  }
3582 
3583  for(int n = value_count_weights;
3584  n < value_count_weights + value_count_rightleft; n += 2) {
3585  dst.ptr[n] = -max_scalar;
3586  dst.ptr[n+1] = max_scalar;
3587  }
3588  }
3589 };
3590 #endif // !KOKKOS_ENABLE_CUDA && !KOKKOS_ENABLE_HIP
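// ----------------------------------------------------------------------------
// Editor's illustrative sketch (not part of the original source). The reducer
// above combines two packed arrays laid out as
//   [ weight slots ... | left/right closest pairs ... ]
// summing the weight slots and taking max (even slot, closest point from the
// left) / min (odd slot, closest point from the right) over the interior
// pairs; the first and last pair are padding and are skipped. A serial
// equivalent with hypothetical names:
template <typename scalar_t>
inline void mj_sketch_combine_packed(scalar_t * dst, const scalar_t * src,
  int value_count_weights, int value_count_rightleft)
{
  for(int n = 0; n < value_count_weights; ++n) {
    dst[n] += src[n];                              // weights accumulate
  }
  for(int n = value_count_weights + 2;
    n < value_count_weights + value_count_rightleft - 2; n += 2) {
    if(src[n] > dst[n]) dst[n] = src[n];           // keep nearest-left point
    if(src[n+1] < dst[n+1]) dst[n+1] = src[n+1];   // keep nearest-right point
  }
}
// ----------------------------------------------------------------------------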
3591 
3592 template<class policy_t, class scalar_t, class part_t, class index_t,
3593  class device_t, class array_t>
3594 struct ReduceWeightsFunctor {
3595  typedef typename policy_t::member_type member_type;
3596  typedef Kokkos::View<scalar_t*> scalar_view_t;
3597 
3598 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3599  typedef array_t value_type[];
3600 #endif
3601 
3602  int loop_count;
3603  array_t max_scalar;
3604 
3605  part_t concurrent_current_part;
3606  part_t num_cuts;
3607  part_t current_work_part;
3608  part_t current_concurrent_num_parts;
3609  int value_count_rightleft;
3610  int value_count_weights;
3611  int value_count;
3612  Kokkos::View<index_t*, device_t> permutations;
3613  Kokkos::View<scalar_t *, device_t> coordinates;
3614  Kokkos::View<scalar_t**, device_t> weights;
3615  Kokkos::View<part_t*, device_t> parts;
3616  Kokkos::View<scalar_t *, device_t> cut_coordinates;
3617  Kokkos::View<index_t *, device_t> part_xadj;
3618  bool uniform_weights0;
3619  scalar_t sEpsilon;
3620 
3621 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3622  Kokkos::View<double *, device_t> current_part_weights;
3623  Kokkos::View<scalar_t *, device_t> current_left_closest;
3624  Kokkos::View<scalar_t *, device_t> current_right_closest;
3625 #endif // KOKKOS_ENABLE_CUDA || defined(KOKKOS_ENABLE_HIP)
3626 
3627  ReduceWeightsFunctor(
3628  int mj_loop_count,
3629  array_t mj_max_scalar,
3630  part_t mj_concurrent_current_part,
3631  part_t mj_num_cuts,
3632  part_t mj_current_work_part,
3633  part_t mj_current_concurrent_num_parts,
3634  part_t mj_left_right_array_size,
3635  part_t mj_weight_array_size,
3636  Kokkos::View<index_t*, device_t> & mj_permutations,
3637  Kokkos::View<scalar_t *, device_t> & mj_coordinates,
3638  Kokkos::View<scalar_t**, device_t> & mj_weights,
3639  Kokkos::View<part_t*, device_t> & mj_parts,
3640  Kokkos::View<scalar_t *, device_t> & mj_cut_coordinates,
3641  Kokkos::View<index_t *, device_t> & mj_part_xadj,
3642  bool mj_uniform_weights0,
3643  scalar_t mj_sEpsilon
3644 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3645  ,Kokkos::View<double *, device_t> & mj_current_part_weights,
3646  Kokkos::View<scalar_t *, device_t> & mj_current_left_closest,
3647  Kokkos::View<scalar_t *, device_t> & mj_current_right_closest
3648 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3649  ) :
3650  loop_count(mj_loop_count),
3651  max_scalar(mj_max_scalar),
3652  concurrent_current_part(mj_concurrent_current_part),
3653  num_cuts(mj_num_cuts),
3654  current_work_part(mj_current_work_part),
3655  current_concurrent_num_parts(mj_current_concurrent_num_parts),
3656  value_count_rightleft(mj_left_right_array_size),
3657  value_count_weights(mj_weight_array_size),
3658  value_count(mj_weight_array_size+mj_left_right_array_size),
3659  permutations(mj_permutations),
3660  coordinates(mj_coordinates),
3661  weights(mj_weights),
3662  parts(mj_parts),
3663  cut_coordinates(mj_cut_coordinates),
3664  part_xadj(mj_part_xadj),
3665  uniform_weights0(mj_uniform_weights0),
3666  sEpsilon(mj_sEpsilon)
3667 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3668  ,current_part_weights(mj_current_part_weights),
3669  current_left_closest(mj_current_left_closest),
3670  current_right_closest(mj_current_right_closest)
3671 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3672  {
3673  }
3674 
3675  size_t team_shmem_size (int team_size) const {
3676 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3677  int result = sizeof(array_t) *
3678  (value_count_weights + value_count_rightleft);
3679 #else
3680  int result = sizeof(array_t) *
3681  (value_count_weights + value_count_rightleft) * team_size;
3682 #endif
3683 
3684  // pad this to a multiple of 8 or the run will corrupt memory
3685  int remainder = result % 8;
3686  if(remainder != 0) {
3687  result += 8 - remainder;
3688  }
3689  return result;
3690  }
3691 
3692  KOKKOS_INLINE_FUNCTION
3693 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3694  void operator() (const member_type & teamMember) const {
3695 #else
3696  void operator() (const member_type & teamMember, value_type teamSum) const {
3697 #endif
3698 
3699  index_t all_begin = (concurrent_current_part == 0) ? 0 :
3700  part_xadj(concurrent_current_part - 1);
3701  index_t all_end = part_xadj(concurrent_current_part);
3702 
3703  index_t num_working_points = all_end - all_begin;
3704  int num_teams = teamMember.league_size();
3705 
3706  index_t stride = num_working_points / num_teams;
3707  if((num_working_points % num_teams) > 0) {
3708  stride += 1; // make sure we have coverage for the final points
3709  }
3710 
3711  // the last team may have less work than the other teams;
3712  // it can even be empty (begin > end) when there are many teams
3713  // and only a few coordinates, as happens in some tests
3714  index_t begin = all_begin + stride * teamMember.league_rank();
3715  index_t end = begin + stride;
3716  if(end > all_end) {
3717  end = all_end;
3718  }
3719 
3720 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3721  size_t sh_mem_size = sizeof(array_t) * (value_count_weights +
3722  value_count_rightleft);
3723 
3724  array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3725  sh_mem_size);
3726 
3727  // init the shared array to 0
3728  Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
3729  for(int n = 0; n < value_count_weights; ++n) {
3730  shared_ptr[n] = 0;
3731  }
3732  for(int n = value_count_weights;
3733  n < value_count_weights + value_count_rightleft; n += 2) {
3734  shared_ptr[n] = -max_scalar;
3735  shared_ptr[n+1] = max_scalar;
3736  }
3737  });
3738  teamMember.team_barrier();
3739 
3740  Kokkos::parallel_for(
3741  Kokkos::TeamThreadRange(teamMember, begin, end),
3742  [=] (index_t ii) {
3743 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3744  // create the team shared data - each thread gets one of the arrays
3745  size_t sh_mem_size = sizeof(array_t) * (value_count_weights +
3746  value_count_rightleft) * teamMember.team_size();
3747 
3748  array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3749  sh_mem_size);
3750 
3751  // select the array for this thread
3752  Zoltan2_MJArrayType<array_t> array(&shared_ptr[teamMember.team_rank() *
3753  (value_count_weights + value_count_rightleft)]);
3754 
3755  // create reducer which handles the Zoltan2_MJArrayType class
3756  ArrayCombinationReducer<policy_t, array_t, part_t> arraySumReducer(
3757  max_scalar, array,
3758  value_count_rightleft,
3759  value_count_weights);
3760 
3761  Kokkos::parallel_reduce(
3762  Kokkos::TeamThreadRange(teamMember, begin, end),
3763 #if (__cplusplus > 201703L)
3764  [=, this] (size_t ii, Zoltan2_MJArrayType<array_t>& threadSum) {
3765 #else
3766  [=] (size_t ii, Zoltan2_MJArrayType<array_t>& threadSum) {
3767 #endif
3768 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3769 
3770  int i = permutations(ii);
3771  scalar_t coord = coordinates(i);
3772  array_t w = uniform_weights0 ? 1 : (array_t) weights(i,0);
3773 
3774  // now check each part and its right cut
3775  index_t part = parts(i)/2;
3776 
3777  int upper = num_cuts;
3778  int lower = 0;
3779 
3780  // binary search - find matching part
3781  while(true) {
3782  scalar_t a = (part == 0) ? -max_scalar : cut_coordinates(part-1);
3783  scalar_t b = (part == num_cuts) ? max_scalar : cut_coordinates(part);
3784 
3785  if(coord >= a + sEpsilon && coord <= b - sEpsilon) {
3786 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3787  Kokkos::atomic_add(&shared_ptr[part*2], w);
3788 #else
3789  threadSum.ptr[part*2] += w;
3790 #endif
3791 
3792  parts(i) = part*2;
3793 
3794  // now handle the left/right closest part
3795 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3796  array_t new_value = (array_t) coord;
3797  array_t prev_value = shared_ptr[value_count_weights + part * 2 + 1];
3798  while(new_value < prev_value) {
3799  prev_value = Kokkos::atomic_compare_exchange(
3800  &shared_ptr[value_count_weights + part * 2 + 1],
3801  prev_value, new_value);
3802  }
3803  prev_value = shared_ptr[value_count_weights + part * 2 + 2];
3804  while(new_value > prev_value) {
3805  prev_value = Kokkos::atomic_compare_exchange(
3806  &shared_ptr[value_count_weights + part * 2 + 2],
3807  prev_value, new_value);
3808  }
3809 #else
3810  // note: the cut to the left must set the right closest point, and the
3811  // cut to the right must set the left closest point. The indices are
3812  // +1 and +2 instead of -1 and +0 because the right/left segment is
3813  // padded with an extra pair at the beginning and end to avoid branching.
3814  if(coord < threadSum.ptr[value_count_weights + part * 2 + 1]) {
3815  threadSum.ptr[value_count_weights + part * 2 + 1] = coord;
3816  }
3817  if(coord > threadSum.ptr[value_count_weights + part * 2 + 2]) {
3818  threadSum.ptr[value_count_weights + part * 2 + 2] = coord;
3819  }
3820 #endif
3821 
3822  break;
3823  }
3824  else if(part != num_cuts) {
3825  if(coord < b + sEpsilon && coord > b - sEpsilon) {
3826  // Note if on cut we set right/left closest to the cut itself
3827  // but we add +2 because we buffered the area with an extra slot
3828  // to reduce cuda branching. So it's +2, +3 instead of +0, +1.
3829 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3830  Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3831  shared_ptr[value_count_weights + part * 2 + 2] = b;
3832  shared_ptr[value_count_weights + part * 2 + 3] = b;
3833 #else
3834  threadSum.ptr[part*2+1] += w;
3835  threadSum.ptr[value_count_weights + part * 2 + 2] = b;
3836  threadSum.ptr[value_count_weights + part * 2 + 3] = b;
3837 #endif
3838 
3839  parts(i) = part*2+1;
3840 
3841  // Need to scan upward for any other cuts at the same coordinate.
3842  // This is costly, but it is only relevant for the fix4785 test,
3843  // which loads many coordinates onto the same point; without this,
3844  // the cuts would all just sit at 0.
3845  part_t base_b = part;
3846  scalar_t base_coord = cut_coordinates(base_b);
3847  part += 1;
3848  while(part < num_cuts) {
3849  b = cut_coordinates(part);
3850  scalar_t delta = b - base_coord;
3851  if(delta < 0) delta = -delta;
3852  if(delta < sEpsilon) {
3853  // Note if on cut we set right/left closest to the cut itself
3854  // but we add +2 because we buffered the area with an extra slot
3855  // to reduce cuda branching. So it's +2, +3 instead of +0, +1.
3856 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3857  Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3858  shared_ptr[value_count_weights + part * 2 + 2] = b;
3859  shared_ptr[value_count_weights + part * 2 + 3] = b;
3860 #else
3861  threadSum.ptr[part*2+1] += w;
3862  threadSum.ptr[value_count_weights + part * 2 + 2] = b;
3863  threadSum.ptr[value_count_weights + part * 2 + 3] = b;
3864 #endif
3865  }
3866  else { break; }
3867  ++part;
3868  }
3869  part = base_b - 1;
3870  while(part >= 0) {
3871  b = cut_coordinates(part);
3872  scalar_t delta = b - base_coord;
3873  if(delta < 0) delta = -delta;
3874  if(delta < sEpsilon) {
3875  // Note if on cut we set right/left closest to the cut itself
3876  // but we add +2 because we buffered the area with an extra slot
3877  // to reduce cuda branching. So it's +2, +3 instead of +0, +1.
3878 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3879  Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3880  shared_ptr[value_count_weights + part * 2 + 2] = b;
3881  shared_ptr[value_count_weights + part * 2 + 3] = b;
3882 #else
3883  threadSum.ptr[part*2+1] += w;
3884  threadSum.ptr[value_count_weights + part * 2 + 2] = b;
3885  threadSum.ptr[value_count_weights + part * 2 + 3] = b;
3886 #endif
3887  }
3888  else { break; }
3889  --part;
3890  }
3891 
3892  break;
3893  }
3894  }
3895 
3896  if(loop_count != 0) {
3897  // subsequent loops can just step towards target
3898  if(coord < b) {
3899  part -= 1;
3900  }
3901  else {
3902  part += 1;
3903  }
3904  }
3905  else {
3906  // initial loop binary search
3907  if(coord < b) {
3908  if(part == lower + 1) {
3909  part = lower;
3910  }
3911  else {
3912  upper = part - 1;
3913  part -= (part - lower)/2;
3914  }
3915  }
3916  else if(part == upper - 1) {
3917  part = upper;
3918  }
3919  else {
3920  lower = part + 1;
3921  part += (upper - part)/2;
3922  }
3923  }
3924  }
3925 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3926  });
3927 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3928  }, arraySumReducer);
3929 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3930 
3931  teamMember.team_barrier();
3932 
3933  // collect all the team's results
3934 #if (__cplusplus > 201703L)
3935  Kokkos::single(Kokkos::PerTeam(teamMember), [=, this] () {
3936 #else
3937  Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
3938 #endif
3939  for(int n = 0; n < value_count_weights; ++n) {
3940 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3941  Kokkos::atomic_add(&current_part_weights(n),
3942  static_cast<double>(shared_ptr[n]));
3943 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3944  teamSum[n] += array.ptr[n];
3945 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3946  }
3947 
3948 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3949  int insert_left = 0;
3950  int insert_right = 0;
3951 #endif
3952 
3953  for(int n = 2 + value_count_weights;
3954  n < value_count_weights + value_count_rightleft - 2; n += 2) {
3955 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3956  scalar_t new_value = shared_ptr[n+1];
3957  scalar_t prev_value = current_right_closest(insert_right);
3958  while(new_value < prev_value) {
3959  prev_value = Kokkos::atomic_compare_exchange(
3960  &current_right_closest(insert_right), prev_value, new_value);
3961  }
3962 
3963  new_value = shared_ptr[n];
3964  prev_value = current_left_closest(insert_left);
3965  while(new_value > prev_value) {
3966  prev_value = Kokkos::atomic_compare_exchange(
3967  &current_left_closest(insert_left), prev_value, new_value);
3968  }
3969 
3970  ++insert_left;
3971  ++insert_right;
3972 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3973  if(array.ptr[n] > teamSum[n]) {
3974  teamSum[n] = array.ptr[n];
3975  }
3976  if(array.ptr[n+1] < teamSum[n+1]) {
3977  teamSum[n+1] = array.ptr[n+1];
3978  }
3979 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3980  }
3981  });
3982 
3983  teamMember.team_barrier();
3984  }
3985 
3986 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3987  KOKKOS_INLINE_FUNCTION
3988  void join(value_type dst, const value_type src) const {
3989  for(int n = 0; n < value_count_weights; ++n) {
3990  dst[n] += src[n];
3991  }
3992 
3993  for(int n = value_count_weights + 2;
3994  n < value_count_weights + value_count_rightleft - 2; n += 2) {
3995  if(src[n] > dst[n]) {
3996  dst[n] = src[n];
3997  }
3998  if(src[n+1] < dst[n+1]) {
3999  dst[n+1] = src[n+1];
4000  }
4001  }
4002  }
4003 
4004  KOKKOS_INLINE_FUNCTION void init (value_type dst) const {
4005  for(int n = 0; n < value_count_weights; ++n) {
4006  dst[n] = 0;
4007  }
4008 
4009  for(int n = value_count_weights;
4010  n < value_count_weights + value_count_rightleft; n += 2) {
4011  dst[n] = -max_scalar;
4012  dst[n+1] = max_scalar;
4013  }
4014  }
4015 #endif // !KOKKOS_ENABLE_CUDA && !KOKKOS_ENABLE_HIP
4016 };
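// ----------------------------------------------------------------------------
// Editor's illustrative sketch (not part of the original source). The heart
// of the operator() above is a binary search that places a coordinate between
// two cut lines: part p spans [cut[p-1], cut[p]) with virtual sentinels at
// -max and +max. A standalone serial version of that search, with
// hypothetical names, assuming cut_coordinates is sorted ascending (it
// ignores the on-cut epsilon handling the functor layers on top):
template <typename scalar_t, typename part_t>
inline part_t mj_sketch_find_part(const scalar_t * cut_coordinates,
  part_t num_cuts, scalar_t coord, scalar_t max_scalar)
{
  part_t lower = 0;
  part_t upper = num_cuts;
  part_t part = upper / 2;
  while(true) {
    scalar_t a = (part == 0) ? -max_scalar : cut_coordinates[part - 1];
    scalar_t b = (part == num_cuts) ? max_scalar : cut_coordinates[part];
    if(coord >= a && coord < b) {
      return part;                       // coord lies in [a, b)
    }
    if(coord < b) {                      // target is below this interval
      if(part == lower + 1) part = lower;
      else {
        upper = part - 1;
        part -= (part - lower) / 2;
      }
    }
    else if(part == upper - 1) part = upper;  // target is above
    else {
      lower = part + 1;
      part += (upper - part) / 2;
    }
  }
}
// ----------------------------------------------------------------------------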
4017 
4025 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4026  typename mj_part_t, typename mj_node_t>
4027 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t,mj_part_t, mj_node_t>::
4028  mj_1D_part_get_part_weights(
4029  mj_part_t current_concurrent_num_parts,
4030  mj_part_t current_work_part,
4031  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
4032  int loop_count)
4033 {
4034  auto local_is_cut_line_determined = is_cut_line_determined;
4035  auto local_thread_part_weights = thread_part_weights;
4036  auto local_thread_cut_left_closest_point = thread_cut_left_closest_point;
4037  auto local_thread_cut_right_closest_point = thread_cut_right_closest_point;
4038 
4039  // Create some locals so we don't capture "this" inside the kernels,
4040  // which causes problems
4041  auto local_sEpsilon = this->sEpsilon;
4042  auto local_assigned_part_ids = this->assigned_part_ids;
4043  auto local_coordinate_permutations = this->coordinate_permutations;
4044  auto local_mj_weights = this->mj_weights;
4045  auto local_part_xadj = this->part_xadj;
4046  auto local_global_min_max_coord_total_weight =
4047  this->global_min_max_coord_total_weight;
4048 
4049  typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;
4050 
4051  auto local_device_num_partitioning_in_current_dim =
4052  device_num_partitioning_in_current_dim;
4053 
4054  Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);
4055  auto local_device_incomplete_cut_count = device_incomplete_cut_count;
4056 
4057  mj_part_t total_part_shift = 0;
4058 
4059  mj_part_t concurrent_cut_shifts = 0;
4060  for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
4061  Kokkos::View<mj_scalar_t *, device_t> local_temp_cut_coords =
4062  Kokkos::subview(temp_cut_coords, std::pair<mj_lno_t, mj_lno_t>(
4063  concurrent_cut_shifts, temp_cut_coords.size()));
4064 
4065  mj_part_t num_parts =
4066  host_num_partitioning_in_current_dim(current_work_part + kk);
4067  mj_part_t num_cuts = num_parts - 1;
4068  mj_part_t total_part_count = num_parts + num_cuts;
4069  mj_part_t weight_array_length = num_cuts + num_parts;
4070 
4071  // for right/left closest + buffer cut on either side
4072  mj_part_t right_left_array_length = (num_cuts + 2) * 2;
4073 
4074  if(this->incomplete_cut_count(kk) == 0) {
4075  total_part_shift += total_part_count;
4076  concurrent_cut_shifts += num_cuts;
4077  continue;
4078  }
4079 
4080  // if not set, use 60 teams - the initial testing value, somewhat arbitrary
4081  auto policy_ReduceWeightsFunctor = policy_t(
4082  mj_num_teams ? mj_num_teams : 60, Kokkos::AUTO);
4083 
4084 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4085  int total_array_length =
4086  weight_array_length + right_left_array_length;
4087 #endif
4088 
4089  // Using float here caused some numerical errors in the coord-on-cut
4090  // calculations. That could probably be fixed with a proper epsilon
4091  // adjustment, but since the cuda path no longer uses the reduction, shared
4092  // memory pressure is not a concern. Just use scalar_t to match the original algorithm.
4093  typedef mj_scalar_t array_t;
4094 
4095 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4096  Kokkos::View<array_t*, Kokkos::HostSpace> reduce_array("reduce_array", total_array_length);
4097 #endif // !KOKKOS_ENABLE_CUDA && !KOKKOS_ENABLE_HIP
4098 
4099  int offset_cuts = 0;
4100  for(int kk2 = 0; kk2 < kk; ++kk2) {
4101  offset_cuts +=
4102  host_num_partitioning_in_current_dim(current_work_part + kk2) - 1;
4103  }
4104  Kokkos::View<double *, device_t> my_current_part_weights =
4105  Kokkos::subview(local_thread_part_weights,
4106  std::pair<mj_lno_t, mj_lno_t>(total_part_shift,
4107  total_part_shift + total_part_count));
4108  Kokkos::View<mj_scalar_t *, device_t> my_current_left_closest =
4109  Kokkos::subview(local_thread_cut_left_closest_point,
4110  std::pair<mj_lno_t, mj_lno_t>(
4111  offset_cuts,
4112  local_thread_cut_left_closest_point.size()));
4113  Kokkos::View<mj_scalar_t *, device_t> my_current_right_closest =
4114  Kokkos::subview(local_thread_cut_right_closest_point,
4115  std::pair<mj_lno_t, mj_lno_t>(
4116  offset_cuts,
4117  local_thread_cut_right_closest_point.size()));
4118 
4119  array_t max_scalar = std::numeric_limits<array_t>::max();
4120 
4121 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4122  // initialize values
4123  Kokkos::parallel_for(
4124  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4125  KOKKOS_LAMBDA (int dummy) {
4126  for(int n = 0; n < weight_array_length; ++n) {
4127  my_current_part_weights(n) = 0;
4128  }
4129  for(int n = 0; n < num_cuts; ++n) {
4130  my_current_left_closest(n) = -max_scalar;
4131  my_current_right_closest(n) = max_scalar;
4132  }
4133  });
4134 #endif
4135 
4136  mj_part_t concurrent_current_part =
4137  current_work_part + kk;
4138 
4139  ReduceWeightsFunctor<policy_t, mj_scalar_t, mj_part_t, mj_lno_t,
4140  typename mj_node_t::device_type, array_t>
4141  teamFunctor(
4142  loop_count,
4143  max_scalar,
4144  concurrent_current_part,
4145  num_cuts,
4146  current_work_part,
4147  current_concurrent_num_parts,
4148  right_left_array_length,
4149  weight_array_length,
4150  coordinate_permutations,
4151  mj_current_dim_coords,
4152  mj_weights,
4153  assigned_part_ids,
4154  local_temp_cut_coords,
4155  part_xadj,
4156  mj_uniform_weights(0), // host and currently only relevant to slot 0
4157  sEpsilon
4158 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4159  ,my_current_part_weights,
4160  my_current_left_closest,
4161  my_current_right_closest
4162 #endif
4163  );
4164 
4165 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4166  Kokkos::parallel_for(policy_ReduceWeightsFunctor, teamFunctor);
4167 #else
4168  Kokkos::parallel_reduce(policy_ReduceWeightsFunctor,
4169  teamFunctor, reduce_array);
4170  Kokkos::fence();
4171 #endif
4172 
4173 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4174  auto hostArray = Kokkos::create_mirror_view(my_current_part_weights);
4175 
4176  for(int i = 0; i < static_cast<int>(total_part_count); ++i) {
4177  hostArray(i) = reduce_array[i];
4178  }
4179 
4180  Kokkos::deep_copy(my_current_part_weights, hostArray);
4181 
4182  auto hostLeftArray = Kokkos::create_mirror_view(my_current_left_closest);
4183  auto hostRightArray = Kokkos::create_mirror_view(my_current_right_closest);
4184  for(mj_part_t cut = 0; cut < num_cuts; ++cut) {
4185  hostLeftArray(cut) = reduce_array[weight_array_length + (cut+1)*2+0];
4186  hostRightArray(cut) = reduce_array[weight_array_length + (cut+1)*2+1];
4187  }
4188  Kokkos::deep_copy(my_current_left_closest, hostLeftArray);
4189  Kokkos::deep_copy(my_current_right_closest, hostRightArray);
4190 #endif
4191 
4192  total_part_shift += total_part_count;
4193  concurrent_cut_shifts += num_cuts;
4194  }
4195 
4196  auto local_temp_cut_coords = temp_cut_coords;
4197 
4198  Kokkos::parallel_for(
4199  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
4200  (0, current_concurrent_num_parts), KOKKOS_LAMBDA(mj_part_t kk) {
4201  mj_part_t num_parts = local_device_num_partitioning_in_current_dim(
4202  current_work_part + kk);
4203  mj_part_t num_cuts = num_parts - 1;
4204  mj_part_t total_part_count = num_parts + num_cuts;
4205 
4206  if(local_device_incomplete_cut_count(kk) > 0) {
4207  // get the prefix sum
4208  // This is inefficient, but it is unclear whether it matters much
4209  size_t offset = 0;
4210  size_t offset_cuts = 0;
4211  for(mj_part_t kk2 = 0; kk2 < kk; ++kk2) {
4212  auto num_parts_kk2 = local_device_num_partitioning_in_current_dim(
4213  current_work_part + kk2);
4214  offset += num_parts_kk2 * 2 - 1;
4215  offset_cuts += num_parts_kk2 - 1;
4216  }
4217 
4218  for(mj_part_t i = 1; i < total_part_count; ++i) {
4219  // check for cuts sharing the same position; all cuts sharing a position
4220  // have the same weight == total weight for all cuts sharing the
4221  // position. Don't want to accumulate that total weight more than once.
4222  if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
4223  std::abs(local_temp_cut_coords(offset_cuts + i / 2) -
4224  local_temp_cut_coords(offset_cuts + i /2 - 1))
4225  < local_sEpsilon) {
4226  // i % 2 == 0 when slot i represents a cut coordinate.
4227  // if it is a cut, and the previous cut has the same coordinate,
4228  // then don't add the weight up again.
4229  local_thread_part_weights(offset + i)
4230  = local_thread_part_weights(offset + i-2);
4231  continue;
4232  }
4233 
4234  // otherwise do the prefix sum.
4235  local_thread_part_weights(offset + i) +=
4236  local_thread_part_weights(offset + i-1);
4237  }
4238  }
4239  });
4240 }
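// ----------------------------------------------------------------------------
// Editor's illustrative sketch (not part of the original source). The final
// kernel above turns the per-slot weights into a running sum over the
// interleaved [part, cut, part, cut, ...] layout, except that a cut sitting
// on the same coordinate as the previous cut copies that cut's accumulated
// weight instead of re-adding the shared weight. A serial equivalent with
// hypothetical names:
template <typename scalar_t>
inline void mj_sketch_prefix_sum_part_weights(
  double * part_weights,        // length total_part_count, interleaved slots
  const scalar_t * cut_coords,  // ascending cut coordinates
  int total_part_count,         // == 2 * num_parts - 1
  scalar_t epsilon)
{
  for(int i = 1; i < total_part_count; ++i) {
    if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
      std::abs(cut_coords[i / 2] - cut_coords[i / 2 - 1]) < epsilon) {
      part_weights[i] = part_weights[i - 2];   // duplicate cut: copy, don't add
      continue;
    }
    part_weights[i] += part_weights[i - 1];    // ordinary running sum
  }
}
// ----------------------------------------------------------------------------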
4241 
4249 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4250  typename mj_part_t, typename mj_node_t>
4251 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4252  mj_combine_rightleft_and_weights(
4253  mj_part_t current_work_part,
4254  mj_part_t current_concurrent_num_parts)
4255 {
4256  auto local_thread_part_weights = this->thread_part_weights;
4257  auto local_is_cut_line_determined = this->is_cut_line_determined;
4258  auto local_thread_cut_left_closest_point =
4259  this->thread_cut_left_closest_point;
4260  auto local_thread_cut_right_closest_point =
4261  this->thread_cut_right_closest_point;
4262  auto local_total_part_weight_left_right_closests =
4263  this->total_part_weight_left_right_closests;
4264  auto local_device_num_partitioning_in_current_dim =
4265  device_num_partitioning_in_current_dim;
4266  Kokkos::parallel_for(
4267  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0,1),
4268  KOKKOS_LAMBDA (int dummy) {
4269 
4270  size_t tlr_array_shift = 0;
4271  mj_part_t cut_shift = 0;
4272  size_t total_part_array_shift = 0;
4273 
4274  // iterate for all concurrent parts to find the left and right closest
4275  // points in the process.
4276  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
4277 
4278  mj_part_t num_parts_in_part =
4279  local_device_num_partitioning_in_current_dim(current_work_part + i);
4280  mj_part_t num_cuts_in_part = num_parts_in_part - 1;
4281  size_t num_total_part_in_part =
4282  num_parts_in_part + size_t (num_cuts_in_part);
4283 
4284  // iterate for cuts in a single part.
4285  for(int ii = 0; ii < num_cuts_in_part; ++ii) {
4286  mj_part_t next = tlr_array_shift + ii;
4287  mj_part_t cut_index = cut_shift + ii;
4288 
4289  if(!local_is_cut_line_determined(cut_index)) {
4290  mj_scalar_t left_closest_in_process =
4291  local_thread_cut_left_closest_point(cut_index);
4292  mj_scalar_t right_closest_in_process =
4293  local_thread_cut_right_closest_point(cut_index);
4294 
4295  // store the left and right closest points.
4296  local_total_part_weight_left_right_closests(
4297  num_total_part_in_part + next) = left_closest_in_process;
4298 
4299  local_total_part_weight_left_right_closests(
4300  num_total_part_in_part + num_cuts_in_part + next) =
4301  right_closest_in_process;
4302  }
4303  }
4304 
4305  for(size_t j = 0; j < num_total_part_in_part; ++j) {
4306  mj_part_t cut_ind = j / 2 + cut_shift;
4307 
4308  // need to check j != num_total_part_in_part - 1,
4309  // which is the same as j/2 != num_cuts_in_part.
4310  // we cannot check it using cut_ind because of the concurrent part
4311  // concatenation.
4312  if(j == num_total_part_in_part - 1 ||
4313  !local_is_cut_line_determined(cut_ind)) {
4314  double pwj = local_thread_part_weights(total_part_array_shift + j);
4315  local_total_part_weight_left_right_closests(tlr_array_shift + j) = pwj;
4316  }
4317  }
4318 
4319  // set the shift position in the arrays
4320  cut_shift += num_cuts_in_part;
4321  tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
4322  total_part_array_shift += num_total_part_in_part;
4323  }
4324  });
4325 }
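// ----------------------------------------------------------------------------
// Editor's illustrative sketch (not part of the original source). The gather
// above writes into one packed "TLR" block per concurrent part chain: first
// the 2p-1 cumulative weights, then p-1 left-closest values, then p-1
// right-closest values, so a chain with p parts occupies
// (2p - 1) + 2(p - 1) slots. A hypothetical helper computing where chain k's
// block begins inside that buffer:
template <typename part_t>
inline size_t mj_sketch_tlr_block_offset(
  const part_t * num_parts_per_chain,  // parts in each concurrent chain
  part_t k)                            // chain index within the batch
{
  size_t offset = 0;
  for(part_t i = 0; i < k; ++i) {
    part_t p = num_parts_per_chain[i];
    offset += (2 * p - 1) + 2 * (p - 1); // weights + left + right segments
  }
  return offset;
}
// ----------------------------------------------------------------------------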
4326 
4339 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4340  typename mj_part_t, typename mj_node_t>
4341 KOKKOS_INLINE_FUNCTION
4342 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
4343  mj_node_t>::mj_calculate_new_cut_position(mj_scalar_t cut_upper_bound,
4344  mj_scalar_t cut_lower_bound,
4345  mj_scalar_t cut_upper_weight,
4346  mj_scalar_t cut_lower_weight,
4347  mj_scalar_t expected_weight,
4348  mj_scalar_t &new_cut_position,
4349  mj_scalar_t sEpsilon) {
4350 
4351  if(std::abs(cut_upper_bound - cut_lower_bound) < sEpsilon) {
4352  new_cut_position = cut_upper_bound; return; // or the lower bound; the two coincide.
4353  }
4354 
4355  if(std::abs(cut_upper_weight - cut_lower_weight) < sEpsilon) {
4356  new_cut_position = cut_lower_bound; return; // avoid dividing by a near-zero weight range below.
4357  }
4358 
4359  mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
4360  mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
4361  mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
4362 
4363  mj_scalar_t required_shift = (my_weight_diff / weight_range);
4364  int scale_constant = 20;
4365  int shiftint = int(required_shift * scale_constant);
4366  if(shiftint == 0) shiftint = 1;
4367  required_shift = mj_scalar_t (shiftint) / scale_constant;
4368  new_cut_position = coordinate_range * required_shift + cut_lower_bound;
4369 }
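// ----------------------------------------------------------------------------
// Editor's illustrative sketch (not part of the original source). A worked
// example of the interpolation above: with lower bound 0.0 carrying weight
// 40, upper bound 10.0 carrying weight 60, and target weight 45,
// required_shift = (45 - 40) / (60 - 40) = 0.25, which quantizes to
// int(0.25 * 20) / 20.0 = 0.25, so the new cut lands at 10.0 * 0.25 = 2.5.
// Quantizing to 1/20 steps (never 0) keeps successive iterations from
// creeping by vanishingly small amounts. Hypothetical serial transcription
// of the main interpolation path:
inline double mj_sketch_interpolate_cut(double lo, double hi,
  double lo_weight, double hi_weight, double target_weight)
{
  const int scale_constant = 20;
  double shift = (target_weight - lo_weight) / (hi_weight - lo_weight);
  int steps = int(shift * scale_constant);
  if(steps == 0) steps = 1;              // always move at least one 1/20 step
  return (hi - lo) * (double(steps) / scale_constant) + lo;
}
// ----------------------------------------------------------------------------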
4370 
4371 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4372 
4373 template<class policy_t, class scalar_t>
4374 struct ArrayReducer {
4375 
4376  typedef ArrayReducer reducer;
4377  typedef Zoltan2_MJArrayType<scalar_t> value_type;
4378  value_type * value;
4379  int value_count;
4380 
4381  KOKKOS_INLINE_FUNCTION ArrayReducer(
4382  value_type &val,
4383  int mj_value_count) :
4384  value(&val),
4385  value_count(mj_value_count)
4386  {}
4387 
4388  KOKKOS_INLINE_FUNCTION
4389  value_type& reference() const {
4390  return *value;
4391  }
4392 
4393  KOKKOS_INLINE_FUNCTION
4394  void join(value_type& dst, const value_type& src) const {
4395  for(int n = 0; n < value_count; ++n) {
4396  dst.ptr[n] += src.ptr[n];
4397  }
4398  }
4399 
4400  KOKKOS_INLINE_FUNCTION void init (value_type& dst) const {
4401  dst.ptr = value->ptr; // must update ptr
4402  for(int n = 0; n < value_count; ++n) {
4403  dst.ptr[n] = 0;
4404  }
4405  }
4406 };
4407 
4408 #endif
4409 
4410 template<class policy_t, class scalar_t, class part_t, class index_t,
4411  class device_t, class array_t>
4412 struct ReduceArrayFunctor {
4413  typedef typename policy_t::member_type member_type;
4414  typedef Kokkos::View<scalar_t*> scalar_view_t;
4415 
4416 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4417  typedef array_t value_type[];
4418 #endif
4419 
4420  part_t concurrent_current_part;
4421  int value_count;
4422  Kokkos::View<index_t*, device_t> permutations;
4423  Kokkos::View<scalar_t *, device_t> coordinates;
4424  Kokkos::View<part_t*, device_t> parts;
4425  Kokkos::View<index_t *, device_t> part_xadj;
4426  Kokkos::View<index_t *, device_t> track_on_cuts;
4427 
4428 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4429  Kokkos::View<int *, device_t> local_point_counts;
4430 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4431 
4432  ReduceArrayFunctor(
4433  part_t mj_concurrent_current_part,
4434  part_t mj_weight_array_size,
4435  Kokkos::View<index_t*, device_t> & mj_permutations,
4436  Kokkos::View<scalar_t *, device_t> & mj_coordinates,
4437  Kokkos::View<part_t*, device_t> & mj_parts,
4438  Kokkos::View<index_t *, device_t> & mj_part_xadj,
4439  Kokkos::View<index_t *, device_t> & mj_track_on_cuts
4440 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4441  ,Kokkos::View<int *, device_t> & mj_local_point_counts
4442 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4443  ) :
4444  concurrent_current_part(mj_concurrent_current_part),
4445  value_count(mj_weight_array_size),
4446  permutations(mj_permutations),
4447  coordinates(mj_coordinates),
4448  parts(mj_parts),
4449  part_xadj(mj_part_xadj),
4450  track_on_cuts(mj_track_on_cuts)
4451 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4452  ,local_point_counts(mj_local_point_counts)
4453 #endif
4454  {
4455  }
4456 
4457  size_t team_shmem_size (int team_size) const {
4458 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4459  int result = sizeof(array_t) * (value_count);
4460 #else
4461  int result = sizeof(array_t) * (value_count) * team_size;
4462 #endif
4463 
4464  // pad this to a multiple of 8 or the run will corrupt memory
4465  int remainder = result % 8;
4466  if(remainder != 0) {
4467  result += 8 - remainder;
4468  }
4469  return result;
4470  }
4471 
4472  KOKKOS_INLINE_FUNCTION
4473 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4474  void operator() (const member_type & teamMember) const {
4475 #else
4476  void operator() (const member_type & teamMember, value_type teamSum) const {
4477 #endif
4478  index_t all_begin = (concurrent_current_part == 0) ? 0 :
4479  part_xadj(concurrent_current_part - 1);
4480  index_t all_end = part_xadj(concurrent_current_part);
4481 
4482  index_t num_working_points = all_end - all_begin;
4483  int num_teams = teamMember.league_size();
4484 
4485  index_t stride = num_working_points / num_teams;
4486  if((num_working_points % num_teams) > 0) {
4487  stride += 1; // make sure we have coverage for the final points
4488  }
4489 
4490  index_t begin = all_begin + stride * teamMember.league_rank();
4491  index_t end = begin + stride;
4492  if(end > all_end) {
4493  end = all_end; // the last team may have less work than the other teams
4494  }
4495 
4496  int track_on_cuts_insert_index = track_on_cuts.size() - 1;
4497 
4498  // create the team shared data - each thread gets one of the arrays
4499 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4500  size_t sh_mem_size = sizeof(array_t) * (value_count);
4501 #else
4502  size_t sh_mem_size =
4503  sizeof(array_t) * (value_count) * teamMember.team_size();
4504 #endif
4505 
4506  array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
4507  sh_mem_size);
4508 
4509 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4510  // init the shared array to 0
4511  Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
4512  for(int n = 0; n < value_count; ++n) {
4513  shared_ptr[n] = 0;
4514  }
4515  });
4516  teamMember.team_barrier();
4517 
4518  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, begin, end),
4519  [=] (index_t ii) {
4520 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4521  // select the array for this thread
4522  Zoltan2_MJArrayType<array_t> array(&shared_ptr[teamMember.team_rank() *
4523  (value_count)]);
4524 
4525  // create reducer which handles the Zoltan2_MJArrayType class
4526  ArrayReducer<policy_t, array_t> arrayReducer(array, value_count);
4527 
4528  Kokkos::parallel_reduce(
4529  Kokkos::TeamThreadRange(teamMember, begin, end),
4530 #if (__cplusplus > 201703L)
4531  [=, this] (size_t ii, Zoltan2_MJArrayType<array_t>& threadSum) {
4532 #else
4533  [=] (size_t ii, Zoltan2_MJArrayType<array_t>& threadSum) {
4534 #endif
4535 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4536 
4537  index_t coordinate_index = permutations(ii);
4538  part_t place = parts(coordinate_index);
4539  part_t part = place / 2;
4540  if(place % 2 == 0) {
4541 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4542  Kokkos::atomic_add(&shared_ptr[part], 1);
4543 #else
4544  threadSum.ptr[part] += 1;
4545 #endif
4546 
4547  parts(coordinate_index) = part;
4548  }
4549  else {
4550  // fill a tracking array so we can process these slower points
4551  // in the next cycle
4552  index_t set_index = Kokkos::atomic_fetch_add(
4553  &track_on_cuts(track_on_cuts_insert_index), 1);
4554  track_on_cuts(set_index) = ii;
4555  }
4556 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4557  });
4558 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4559  }, arrayReducer);
4560 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4561 
4562  teamMember.team_barrier();
4563 
4564  // collect all the team's results
4565 #if (__cplusplus > 201703L)
4566  Kokkos::single(Kokkos::PerTeam(teamMember), [=, this] () {
4567 #else
4568  Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
4569 #endif
4570  for(int n = 0; n < value_count; ++n) {
4571 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4572  Kokkos::atomic_add(&local_point_counts(n), shared_ptr[n]);
4573 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4574  teamSum[n] += array.ptr[n];
4575 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4576  }
4577  });
4578 
4579  teamMember.team_barrier();
4580  }
4581 
4582 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4583 
4584  KOKKOS_INLINE_FUNCTION
4585  void join(value_type dst, const value_type src) const {
4586  for(int n = 0; n < value_count; ++n) {
4587  dst[n] += src[n];
4588  }
4589  }
4590 
4591  KOKKOS_INLINE_FUNCTION void init (value_type dst) const {
4592  for(int n = 0; n < value_count; ++n) {
4593  dst[n] = 0;
4594  }
4595  }
4596 #endif
4597 };
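// ----------------------------------------------------------------------------
// Editor's illustrative sketch (not part of the original source). The on-cut
// bookkeeping above reserves slots in track_on_cuts through an atomic counter
// kept in the array's last element, so concurrent threads can append without
// collisions. The same pattern in isolation, with hypothetical names:
template <typename view_t, typename index_t>
KOKKOS_INLINE_FUNCTION
void mj_sketch_append_tracked(view_t track, index_t item)
{
  // the last element holds the running insert count; fetch-and-increment
  // hands each caller a unique slot, so no further synchronization is needed
  index_t slot = Kokkos::atomic_fetch_add(
    &track(track.size() - 1), index_t(1));
  track(slot) = item;
}
// ----------------------------------------------------------------------------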
4598 
4614 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4615  typename mj_part_t, typename mj_node_t>
4616 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4617 mj_create_new_partitions(
4618  mj_part_t num_parts,
4619  mj_part_t current_concurrent_work_part,
4620  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
4621  Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
4622  Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
4623  Kokkos::View<mj_lno_t *, device_t> & out_part_xadj)
4624 {
4625  // Get locals for cuda
4626  auto local_thread_part_weight_work = this->thread_part_weight_work;
4627  auto local_point_counts = this->thread_point_counts;
4628  auto local_distribute_points_on_cut_lines =
4629  this->distribute_points_on_cut_lines;
4630  auto local_thread_cut_line_weight_to_put_left =
4631  this->thread_cut_line_weight_to_put_left;
4632  auto local_sEpsilon = this->sEpsilon;
4633  auto local_coordinate_permutations = this->coordinate_permutations;
4634  auto local_mj_weights = this->mj_weights;
4635  auto local_assigned_part_ids = this->assigned_part_ids;
4636  auto local_new_coordinate_permutations = this->new_coordinate_permutations;
4637 
4638  mj_part_t num_cuts = num_parts - 1;
4639 
4640  Kokkos::parallel_for(
4641  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4642  KOKKOS_LAMBDA(int dummy) {
4643 
4644  if(local_distribute_points_on_cut_lines) {
4645  for(int i = 0; i < num_cuts; ++i) {
4646  mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
4647  if(left_weight > local_sEpsilon) {
4648  // the weight of thread ii on cut.
4649  mj_scalar_t thread_ii_weight_on_cut =
4650  local_thread_part_weight_work(i * 2 + 1) -
4651  local_thread_part_weight_work(i * 2);
4652 
4653  if(thread_ii_weight_on_cut < left_weight) {
4654  // if left weight is bigger than threads weight on cut.
4655  local_thread_cut_line_weight_to_put_left(i) =
4656  thread_ii_weight_on_cut;
4657  }
4658  else {
4659  // if thread's weight is bigger than space, then put only a portion.
4660  local_thread_cut_line_weight_to_put_left(i) = left_weight;
4661  }
4662  left_weight -= thread_ii_weight_on_cut;
4663  }
4664  else {
4665  local_thread_cut_line_weight_to_put_left(i) = 0;
4666  }
4667  }
4668 
4669  // this is a special case. If cutlines share the same coordinate,
4670  // their weights are equal. We need to adjust the ratio for that.
4671  for(mj_part_t i = num_cuts - 1; i > 0 ; --i) {
4672  if(std::abs(current_concurrent_cut_coordinate(i) -
4673  current_concurrent_cut_coordinate(i -1)) < local_sEpsilon) {
4674  local_thread_cut_line_weight_to_put_left(i) -=
4675  local_thread_cut_line_weight_to_put_left(i - 1);
4676  }
4677  local_thread_cut_line_weight_to_put_left(i) =
4678  static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
4679  least_signifiance) * significance_mul) /
4680  static_cast<mj_scalar_t>(significance_mul);
4681  }
4682  }
4683 
4684  for(mj_part_t i = 0; i < num_parts; ++i) {
4685  local_point_counts(i) = 0;
4686  }
4687  });
4688 
4689  mj_lno_t coordinate_begin_index =
4690  current_concurrent_work_part == 0 ? 0 :
4691  host_part_xadj(current_concurrent_work_part - 1);
4692  mj_lno_t coordinate_end_index =
4693  host_part_xadj(current_concurrent_work_part);
4694 
4695  mj_lno_t total_on_cut;
4696  Kokkos::parallel_reduce("Get total_on_cut",
4697  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (
4698  coordinate_begin_index, coordinate_end_index),
4699  KOKKOS_LAMBDA(int ii, mj_lno_t & val) {
4700  mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4701  mj_part_t coordinate_assigned_place =
4702  local_assigned_part_ids(coordinate_index);
4703  if(coordinate_assigned_place % 2 == 1) {
4704  val += 1;
4705  }
4706  }, total_on_cut);
4707 
4708  Kokkos::View<mj_lno_t *, device_t> track_on_cuts;
4709  if(total_on_cut > 0) {
4710  track_on_cuts = Kokkos::View<mj_lno_t *, device_t>(
4711  "track_on_cuts", // would use WithoutInitialization, but the last element must start at 0
4712  total_on_cut + 1); // extra slot used for tracking the insert index
4713  }
4714 
4715  // here we need a parallel reduction over an array to count the coords in
4716  // each part; atomic adds, especially for a low part count, would kill us.
4717  // in the original setup we kept arrays allocated for each thread, but for
4718  // the cuda version we'd like to avoid allocating N arrays for the number
4719  // of teams/threads, which would be complicated depending on whether we
4720  // run openmp or cuda.
4721  typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;
4722 
4723  // if not set use 60 - somewhat arbitrary based on initial performance tests
4724  int use_num_teams = mj_num_teams ? mj_num_teams : 60;
4725 
4726  auto policy_ReduceFunctor = policy_t(use_num_teams, Kokkos::AUTO);
4727  typedef int array_t;
4728 
4729  // just need the parts - points on the cuts will be handled in a separate
4730  // serial call after this.
4731 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4732  Kokkos::View<array_t*, Kokkos::HostSpace> reduce_array("reduce_array", num_parts);
4733 #endif
4734 
4735  ReduceArrayFunctor<policy_t, mj_scalar_t, mj_part_t, mj_lno_t,
4736  typename mj_node_t::device_type, array_t>teamFunctor(
4737  current_concurrent_work_part,
4738  num_parts,
4739  coordinate_permutations,
4740  mj_current_dim_coords,
4741  assigned_part_ids,
4742  part_xadj,
4743  track_on_cuts
4744 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4745  ,local_point_counts
4746 #endif
4747  );
4748 
4749 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4750  Kokkos::parallel_for(policy_ReduceFunctor, teamFunctor);
4751 #else
4752  Kokkos::parallel_reduce(policy_ReduceFunctor, teamFunctor, reduce_array);
4753  Kokkos::fence();
4754 #endif
4755 
4756 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4757  for(mj_part_t part = 0; part < num_parts; ++part) {
4758  local_point_counts(part) = reduce_array[part];
4759  }
4760 #endif
4761 
4762  // the last element is a utility slot used for atomically inserting the values.
4763  // Sorting here avoids potential nondeterminism in the partitioning results.
4764  if(track_on_cuts.size() > 0) { // size 0 means unused; otherwise size is at least 2
4765  auto track_on_cuts_sort = Kokkos::subview(track_on_cuts,
4766  std::pair<mj_lno_t, mj_lno_t>(0, track_on_cuts.size() - 1)); // do not sort last element
4767  Kokkos::sort(track_on_cuts_sort);
4768  }
4769 
4770  bool uniform_weights0 = this->mj_uniform_weights(0);
4771  Kokkos::parallel_for(
4772  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4773  KOKKOS_LAMBDA (int dummy) {
4774 
4775  for(int j = 0; j < total_on_cut; ++j) {
4776  int ii = track_on_cuts(j);
4777  mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4778  mj_scalar_t coordinate_weight = uniform_weights0 ? 1 :
4779  local_mj_weights(coordinate_index,0);
4780  mj_part_t coordinate_assigned_place =
4781  local_assigned_part_ids(coordinate_index);
4782  mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
4783  // if it is on the cut.
4784  if(local_distribute_points_on_cut_lines &&
4785  local_thread_cut_line_weight_to_put_left(
4786  coordinate_assigned_part) > local_sEpsilon) {
4787  // if rectilinear partitioning is allowed,
4788  // and the thread still has space on the left of the cut,
4789  // then the thread puts the vertex to the left.
4790  local_thread_cut_line_weight_to_put_left(
4791  coordinate_assigned_part) -= coordinate_weight;
4792  // if putting the vertex to left increased the weight more
4793  // than expected, and if the next cut is on the same coordinate,
4794  // then we need to adjust how much weight next cut puts to its left as
4795  // well, in order to take care of the imbalance.
4796  if(local_thread_cut_line_weight_to_put_left(
4797  coordinate_assigned_part) < 0 && coordinate_assigned_part <
4798  num_cuts - 1 &&
4799  std::abs(current_concurrent_cut_coordinate(
4800  coordinate_assigned_part+1) -
4801  current_concurrent_cut_coordinate(
4802  coordinate_assigned_part)) < local_sEpsilon)
4803  {
4804  local_thread_cut_line_weight_to_put_left(
4805  coordinate_assigned_part + 1) +=
4806  local_thread_cut_line_weight_to_put_left(
4807  coordinate_assigned_part);
4808  }
4809  ++local_point_counts(coordinate_assigned_part);
4810  local_assigned_part_ids(coordinate_index) =
4811  coordinate_assigned_part;
4812  }
4813  else {
4814  // if there is no more space on the left, put the coordinate to the
4815  // right of the cut.
4816  ++coordinate_assigned_part;
4817  // this while loop is necessary when a line is partitioned into more
4818  // than 2 parts.
4819  while(local_distribute_points_on_cut_lines &&
4820  coordinate_assigned_part < num_cuts)
4821  {
4822  // traverse all the cut lines sharing the same coordinate
4823  if(std::abs(current_concurrent_cut_coordinate(
4824  coordinate_assigned_part) -
4825  current_concurrent_cut_coordinate(
4826  coordinate_assigned_part - 1)) < local_sEpsilon)
4827  {
4828  // if line has enough space on left, put it there.
4829  if(local_thread_cut_line_weight_to_put_left(
4830  coordinate_assigned_part) > local_sEpsilon &&
4831  local_thread_cut_line_weight_to_put_left(
4832  coordinate_assigned_part) >=
4833  std::abs(local_thread_cut_line_weight_to_put_left(
4834  coordinate_assigned_part) - coordinate_weight))
4835  {
4836  local_thread_cut_line_weight_to_put_left(
4837  coordinate_assigned_part) -= coordinate_weight;
4838  // Again if it put too much on left of the cut,
4839  // update how much the next cut sharing the same coordinate will
4840  // put to its left.
4841  if(local_thread_cut_line_weight_to_put_left(
4842  coordinate_assigned_part) < 0 &&
4843  coordinate_assigned_part < num_cuts - 1 &&
4844  std::abs(current_concurrent_cut_coordinate(
4845  coordinate_assigned_part+1) -
4846  current_concurrent_cut_coordinate(
4847  coordinate_assigned_part)) < local_sEpsilon)
4848  {
4849  local_thread_cut_line_weight_to_put_left(
4850  coordinate_assigned_part + 1) +=
4851  local_thread_cut_line_weight_to_put_left(
4852  coordinate_assigned_part);
4853  }
4854  break;
4855  }
4856  }
4857  else {
4858  break;
4859  }
4860  ++coordinate_assigned_part;
4861  }
4862  local_point_counts(coordinate_assigned_part) += 1;
4863  local_assigned_part_ids(coordinate_index) = coordinate_assigned_part;
4864  }
4865  }
4866 
4867  for(int j = 0; j < num_parts; ++j) {
4868  out_part_xadj(j) = local_point_counts(j);
4869  local_point_counts(j) = 0;
4870 
4871  if(j != 0) {
4872  out_part_xadj(j) += out_part_xadj(j - 1);
4873  local_point_counts(j) += out_part_xadj(j - 1);
4874  }
4875  }
4876  });
4877 
4878  // here we will determine the insert indices for N teams,
4879  // then all the teams can fill in parallel
4880 
4881 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4882 
4883  // This is the fastest approach so far - just straight atomic writes for CUDA.
4884  // The write order is not deterministic since it is atomic,
4885  // but the final partition is deterministic.
4886  Kokkos::parallel_for(
4887  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
4888  coordinate_begin_index, coordinate_end_index),
4889  KOKKOS_LAMBDA (mj_lno_t ii) {
4890  mj_lno_t i = local_coordinate_permutations(ii);
4891  mj_part_t p = local_assigned_part_ids(i);
4892  mj_lno_t idx = Kokkos::atomic_fetch_add(&local_point_counts(p), 1);
4893  local_new_coordinate_permutations(coordinate_begin_index + idx) = i;
4894  });
4895 
4896 #else
4897 
4898 #ifdef KOKKOS_ENABLE_OPENMP
4899  // to be revisited - reverted to 1 for reproducible automated testing
4900  const int num_threads = 1; // Kokkos::OpenMP::impl_max_hardware_threads();
4901 #else
4902  const int num_threads = 1;
4903 #endif
4904 
4905  const int num_teams = 1; // cuda is handled above using a different format
4906 
4907  // allow init - we want all 0's first
4908  Kokkos::View<mj_lno_t*, device_t>
4909  point_counter("insert indices", num_teams * num_threads * num_parts);
4910 
4911  // count how many coords per thread
4912  // then we will fill each independently
4913  Kokkos::TeamPolicy<typename mj_node_t::execution_space>
4914  block_policy(num_teams, num_threads);
4916  typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::member_type member_type;
4917  mj_lno_t range = coordinate_end_index - coordinate_begin_index;
4918  mj_lno_t block_size = range / num_teams + 1;
4919  Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(member_type team_member) {
4920  int team = team_member.league_rank();
4921  int team_offset = team * num_threads * num_parts;
4922  mj_lno_t begin = coordinate_begin_index + team * block_size;
4923  mj_lno_t end = begin + block_size;
4924  if(end > coordinate_end_index) {
4925  end = coordinate_end_index;
4926  }
4927 
4928  Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
4929  [=] (mj_lno_t ii) {
4930  int thread = team_member.team_rank();
4931  mj_lno_t i = local_coordinate_permutations(ii);
4932  mj_part_t p = local_assigned_part_ids(i);
4933  int index = team_offset + thread * num_parts + p;
4934  ++point_counter(index);
4935  });
4936  });
4937 
4938  // now prefix sum
4939  // we currently have the counts in the slots
4940  // we want the first counter for each part to be 0
4941  // then the rest should be the sum of all the priors
4942  Kokkos::parallel_for(
4943  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4944  KOKKOS_LAMBDA (int dummy) {
4945  int num_sets = point_counter.size() / num_parts;
4946  for(int set = num_sets - 1; set >= 1; set -=1) {
4947  int base = set * num_parts;
4948  for(int part = 0; part < num_parts; ++part) {
4949  point_counter(base + part) = point_counter(base + part - num_parts);
4950  }
4951  }
4952 
4953  for(int part = 0; part < num_parts; ++part) {
4954  point_counter(part) = 0;
4955  }
4956 
4957  for(int set = 1; set < num_sets; ++set) {
4958  int base = set * num_parts;
4959  for(int part = 0; part < num_parts; ++part) {
4960  point_counter(base + part) += point_counter(base + part - num_parts);
4961  }
4962  }
4963  });
4964 
4965  // now permute
4966  Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(member_type team_member) {
4967  int team = team_member.league_rank();
4968  int team_offset = team * num_threads * num_parts;
4969  mj_lno_t begin = coordinate_begin_index + team * block_size;
4970  mj_lno_t end = begin + block_size;
4971  if(end > coordinate_end_index) {
4972  end = coordinate_end_index;
4973  }
4974  Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
4975  [=] (mj_lno_t ii) {
4976  int thread = team_member.team_rank();
4977  mj_lno_t i = local_coordinate_permutations(ii);
4978  mj_part_t p = local_assigned_part_ids(i);
4979  int index = team_offset + thread * num_parts + p;
4980  int set_counter = (point_counter(index)++) + local_point_counts(p);
4981  local_new_coordinate_permutations(coordinate_begin_index + set_counter) = i;
4982  });
4983  });
4984 #endif
4985 }
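// ----------------------------------------------------------------------------
// Editor's illustrative sketch (not part of the original source). The
// non-GPU path above is a counting sort: count the points per part, turn the
// counts into exclusive prefix offsets, then scatter each point to the next
// free slot of its part, preserving input order within each part. A compact
// serial equivalent with hypothetical names:
template <typename index_t, typename part_t>
inline void mj_sketch_counting_scatter(
  const part_t * part_of,   // part id of each point
  index_t num_points,
  part_t num_parts,
  index_t * permutation)    // out: point indices grouped by part
{
  std::vector<index_t> offset(num_parts + 1, 0);
  for(index_t i = 0; i < num_points; ++i) {
    ++offset[part_of[i] + 1];              // histogram, shifted by one slot
  }
  for(part_t p = 0; p < num_parts; ++p) {
    offset[p + 1] += offset[p];            // exclusive prefix sum
  }
  for(index_t i = 0; i < num_points; ++i) {
    permutation[offset[part_of[i]]++] = i; // stable scatter
  }
}
// ----------------------------------------------------------------------------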
4986 
5030 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5031  typename mj_part_t, typename mj_node_t>
5032 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
5033  mj_node_t>::mj_get_new_cut_coordinates(
5034  mj_part_t current_concurrent_num_parts,
5035  mj_part_t kk,
5036  const mj_part_t &num_cuts,
5037  const double &used_imbalance_tolerance,
5038  Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
5039  Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
5040  Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
5041  Kokkos::View<bool *, device_t> & current_cut_line_determined,
5042  Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
5043  Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
5044  Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
5045  Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
5046  Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
5047  Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
5048  Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
5049  Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
5050  Kokkos::View<mj_scalar_t *, device_t> &
5051  current_part_cut_line_weight_to_put_left,
5052  Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count)
5053 {
5054  Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);
5055 
5056  auto local_device_incomplete_cut_count = device_incomplete_cut_count;
5057  auto local_sEpsilon = sEpsilon;
5058  auto local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
5059  auto local_global_rectilinear_cut_weight = global_rectilinear_cut_weight;
5060  auto local_process_rectilinear_cut_weight = process_rectilinear_cut_weight;
5061  auto local_global_min_max_coord_total_weight =
5062  global_min_max_coord_total_weight;
5063 
5064  const auto _sEpsilon = this->sEpsilon;
5065  // Note: for a 22-part system I tried removing the outer loop
5066  // and doing each sub-loop as a simple parallel_for over num_cuts.
5067  // But that was about twice as slow (10ms) as the current form (5ms),
5068  // so I think the overhead of launching the new global parallel kernels
5069  // is costly. This form runs just one team, so it effectively uses
5070  // a single warp to process the cuts. I expect that with a lot of parts
5071  // this might need changing.
5072  Kokkos::TeamPolicy<typename mj_node_t::execution_space>
5073  policy_one_team(1, Kokkos::AUTO());
5074  typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
5075  member_type member_type;
5076  Kokkos::parallel_for(policy_one_team, KOKKOS_LAMBDA(member_type team_member) {
5077 
5078  mj_scalar_t min_coordinate =
5079  local_global_min_max_coord_total_weight(kk);
5080  mj_scalar_t max_coordinate =
5081  local_global_min_max_coord_total_weight(
5082  kk + current_concurrent_num_parts);
5083  mj_scalar_t global_total_weight =
5084  local_global_min_max_coord_total_weight(
5085  kk + current_concurrent_num_parts * 2);
5086 
5087  Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, num_cuts),
5088  [=] (mj_part_t i) {
5089  // if the left and right closest points are not set yet,
5090  // set them to the cut itself.
5091  if(min_coordinate -
5092  current_global_left_closest_points(i) > local_sEpsilon) {
5093  current_global_left_closest_points(i) =
5094  current_cut_coordinates(i);
5095  }
5096  if(current_global_right_closest_points(i) -
5097  max_coordinate > local_sEpsilon) {
5098  current_global_right_closest_points(i) =
5099  current_cut_coordinates(i);
5100  }
5101  });
5102  team_member.team_barrier(); // for end of Kokkos::TeamThreadRange
5103 
5104  Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, num_cuts),
5105  [=] (mj_part_t i) {
5106  using algMJ_t = AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
5107  mj_node_t>;
5108  // seen weight in the part
5109  mj_scalar_t seen_weight_in_part = 0;
5110  // expected weight for part.
5111  mj_scalar_t expected_weight_in_part = 0;
5112  // imbalance for the left and right side of the cut.
5113  double imbalance_on_left = 0, imbalance_on_right = 0;
5114  if(local_distribute_points_on_cut_lines) {
5115  // init the weight on the cut.
5116  local_global_rectilinear_cut_weight(i) = 0;
5117  local_process_rectilinear_cut_weight(i) = 0;
5118  }
5119  bool bContinue = false;
5120  // if already determined at previous iterations,
5121  // then just write the coordinate to new array, and proceed.
5122  if(current_cut_line_determined(i)) {
5123  new_current_cut_coordinates(i) =
5124  current_cut_coordinates(i);
5125  bContinue = true;
5126  }
5127  if(!bContinue) {
5128  // current weight of the part to the left of the cut line.
5129  seen_weight_in_part = current_global_part_weights(i * 2);
5130 
5131  // expected weight in the part.
5132  expected_weight_in_part = current_part_target_weights(i);
5133 
5134  //leftImbalance = imbalanceOf(seenW, globalTotalWeight, expected);
5135  imbalance_on_left = algMJ_t::calculate_imbalance(seen_weight_in_part,
5136  expected_weight_in_part);
5137  // rightImbalance = imbalanceOf(globalTotalWeight - seenW,
5138  // globalTotalWeight, 1 - expected);
5139  imbalance_on_right = algMJ_t::calculate_imbalance(global_total_weight -
5140  seen_weight_in_part, global_total_weight - expected_weight_in_part);
5141  bool is_left_imbalance_valid = std::abs(imbalance_on_left) -
5142  used_imbalance_tolerance < local_sEpsilon;
5143  bool is_right_imbalance_valid = std::abs(imbalance_on_right) -
5144  used_imbalance_tolerance < local_sEpsilon;
5145  // if the cut line reaches the desired imbalance.
5146  if(is_left_imbalance_valid && is_right_imbalance_valid) {
5147  current_cut_line_determined(i) = true;
5148  Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5149  new_current_cut_coordinates(i) = current_cut_coordinates(i);
5150  }
5151  else if(imbalance_on_left < 0) {
5152  // if left imbalance < 0, then we need to move the cut to the right.
5153  if(local_distribute_points_on_cut_lines) {
5154  // if it is okay to distribute the coordinates sitting exactly on
5155  // the cut line between the left and the right,
5156  // then check whether we can reach the target weight by including
5157  // the coordinates on the cut in the part.
5158  if(current_global_part_weights(i * 2 + 1) ==
5159  expected_weight_in_part) {
5160  // if so, we are done.
5161  current_cut_line_determined(i) = true;
5162  Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5163 
5164  //then assign everything on the cut to the left of the cut.
5165  new_current_cut_coordinates(i) =
5166  current_cut_coordinates(i);
5167  // for this cut, all the weight on the cut will be put to the left.
5168  current_part_cut_line_weight_to_put_left(i) =
5169  current_local_part_weights(i * 2 + 1) -
5170  current_local_part_weights(i * 2);
5171  bContinue = true;
5172  }
5173  else if(current_global_part_weights(i * 2 + 1) >
5174  expected_weight_in_part) {
5175  // if the weight is larger than the expected weight,
5176  // then we need to distribute some points to left, some to right.
5177  current_cut_line_determined(i) = true;
5178  Kokkos::atomic_add(&view_rectilinear_cut_count(0), 1);
5179 
5180  // increase the num cuts to be determined with rectilinear
5181  // partitioning.
5182  Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5183  new_current_cut_coordinates(i) =
5184  current_cut_coordinates(i);
5185  local_process_rectilinear_cut_weight[i] =
5186  current_local_part_weights(i * 2 + 1) -
5187  current_local_part_weights(i * 2);
5188  bContinue = true;
5189  }
5190  }
5191 
5192  if(!bContinue) {
5193 
5194  // we need to move further right, so set the lower bound to the
5195  // current line, and shift it to the closest point on the right.
5196  current_cut_lower_bounds(i) =
5197  current_global_right_closest_points(i);
5198 
5199  //set the lower bound weight to the weight we have seen.
5200  current_cut_lower_bound_weights(i) = seen_weight_in_part;
5201 
5202  // compare the upper bound with what has been found in the
5203  // last iteration.
5204  // we try to make stricter bounds for the cut here.
5205  for(mj_part_t ii = i + 1; ii < num_cuts ; ++ii) {
5206  mj_scalar_t p_weight = current_global_part_weights(ii * 2);
5207  mj_scalar_t line_weight =
5208  current_global_part_weights(ii * 2 + 1);
5209  if(p_weight >= expected_weight_in_part) {
5210  // if a cut on the right has the expected weight, then we found
5211  // our cut position. Set the upper and lower coordinates to this
5212  // new cut coordinate, but we need one more iteration to
5213  // finalize the cut position, as we need to update the part ids.
5214  if(p_weight == expected_weight_in_part) {
5215  current_cut_upper_bounds(i) =
5216  current_cut_coordinates(ii);
5217  current_cut_upper_weights(i) = p_weight;
5218  current_cut_lower_bounds(i) =
5219  current_cut_coordinates(ii);
5220  current_cut_lower_bound_weights(i) = p_weight;
5221  } else if(p_weight < current_cut_upper_weights(i)) {
5222  // if a part weight is larger than my expected weight,
5223  // but lower than my upper bound weight, update upper bound.
5224  current_cut_upper_bounds(i) =
5225  current_global_left_closest_points(ii);
5226  current_cut_upper_weights(i) = p_weight;
5227  }
5228  break;
5229  }
5230  // if we reach here, then p_weight < expected_weight_in_part,
5231  // so compare the weight against the line weight.
5232  if(line_weight >= expected_weight_in_part) {
5233  // if the line weight is larger than the expected weight, then we
5234  // need to reach the balance by distributing coordinates on
5235  // this line.
5236  current_cut_upper_bounds(i) =
5237  current_cut_coordinates(ii);
5238  current_cut_upper_weights(i) = line_weight;
5239  current_cut_lower_bounds(i) =
5240  current_cut_coordinates(ii);
5241  current_cut_lower_bound_weights(i) = p_weight;
5242  break;
5243  }
5244  // if a stricter lower bound is found,
5245  // update the lower bound.
5246  if(p_weight <= expected_weight_in_part && p_weight >=
5247  current_cut_lower_bound_weights(i)) {
5248  current_cut_lower_bounds(i) =
5249  current_global_right_closest_points(ii);
5250  current_cut_lower_bound_weights(i) = p_weight;
5251  }
5252  }
5253 
5254  mj_scalar_t new_cut_position = 0;
5255  algMJ_t::mj_calculate_new_cut_position(
5256  current_cut_upper_bounds(i),
5257  current_cut_lower_bounds(i),
5258  current_cut_upper_weights(i),
5259  current_cut_lower_bound_weights(i),
5260  expected_weight_in_part, new_cut_position,
5261  _sEpsilon);
5262 
5263  // if the cut line does not move significantly,
5264  // then finalize the search.
5265  if(std::abs(current_cut_coordinates(i) -
5266  new_cut_position) < local_sEpsilon) {
5267  current_cut_line_determined(i) = true;
5268  Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5269 
5270  //set the cut coordinate and proceed.
5271  new_current_cut_coordinates(i) =
5272  current_cut_coordinates(i);
5273  } else {
5274  new_current_cut_coordinates(i) = new_cut_position;
5275  }
5276  } // bContinue
5277  } else {
5278  // need to move the cut line to the left.
5279  // set upper bound to current line.
5280  current_cut_upper_bounds(i) =
5281  current_global_left_closest_points(i);
5282  current_cut_upper_weights(i) =
5283  seen_weight_in_part;
5284  // compare the current cut line weights with
5285  // previous upper and lower bounds.
5286  for(int ii = i - 1; ii >= 0; --ii) {
5287  mj_scalar_t p_weight =
5288  current_global_part_weights(ii * 2);
5289  mj_scalar_t line_weight =
5290  current_global_part_weights(ii * 2 + 1);
5291  if(p_weight <= expected_weight_in_part) {
5292  if(p_weight == expected_weight_in_part) {
5293  // if the weight of the part is my expected weight,
5294  // then we have found the solution.
5295  current_cut_upper_bounds(i) =
5296  current_cut_coordinates(ii);
5297  current_cut_upper_weights(i) = p_weight;
5298  current_cut_lower_bounds(i) =
5299  current_cut_coordinates(ii);
5300  current_cut_lower_bound_weights(i) = p_weight;
5301  }
5302  else if(p_weight > current_cut_lower_bound_weights(i)) {
5303  // if the found weight is bigger than the lower bound,
5304  // then update the lower bound.
5305  current_cut_lower_bounds(i) =
5306  current_global_right_closest_points(ii);
5307  current_cut_lower_bound_weights(i) = p_weight;
5308 
5309  // at the same time, if weight of line is bigger than the
5310  // expected weight, then update the upper bound as well.
5311  // in this case the balance will be obtained by distributing
5312  // weights on this cut position.
5313  if(line_weight > expected_weight_in_part) {
5314  current_cut_upper_bounds(i) =
5315  current_global_right_closest_points(ii);
5316  current_cut_upper_weights(i) = line_weight;
5317  }
5318  }
5319  break;
5320  }
5321  // if the weight of the cut on the left is still bigger than
5322  // my expected weight, and that weight is either smaller than the
5323  // current upper weight, or equal to the current upper weight but
5324  // located to the left of the current upper bound, then update
5325  // the upper bound.
5326  if(p_weight >= expected_weight_in_part &&
5327  (p_weight < current_cut_upper_weights(i) ||
5328  (p_weight == current_cut_upper_weights(i) &&
5329  current_cut_upper_bounds(i) >
5330  current_global_left_closest_points(ii)))) {
5331  current_cut_upper_bounds(i) =
5332  current_global_left_closest_points(ii);
5333  current_cut_upper_weights(i) = p_weight;
5334  }
5335  }
5336  mj_scalar_t new_cut_position = 0;
5337  algMJ_t::mj_calculate_new_cut_position(
5338  current_cut_upper_bounds(i),
5339  current_cut_lower_bounds(i),
5340  current_cut_upper_weights(i),
5341  current_cut_lower_bound_weights(i),
5342  expected_weight_in_part,
5343  new_cut_position,
5344  _sEpsilon);
5345 
5346  // if the cut line does not move significantly.
5347  if(std::abs(current_cut_coordinates(i) -
5348  new_cut_position) < local_sEpsilon) {
5349  current_cut_line_determined(i) = true;
5350  Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5351  //set the cut coordinate and proceed.
5352  new_current_cut_coordinates(i) =
5353  current_cut_coordinates(i);
5354  } else {
5355  new_current_cut_coordinates(i) =
5356  new_cut_position;
5357  }
5358  }
5359  } // bContinue
5360  });
5361 
5362  team_member.team_barrier(); // for end of Kokkos::TeamThreadRange
5363  });
5364 
5365  // view_rectilinear_cut_count
5366  mj_part_t rectilinear_cut_count;
5367  Kokkos::parallel_reduce("Read bDoingWork",
5368  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
5369  KOKKOS_LAMBDA(int dummy, int & set_single) {
5370  set_single = view_rectilinear_cut_count(0);
5371  }, rectilinear_cut_count);
5372 
5373  if(rectilinear_cut_count > 0) {
5374  auto host_local_process_rectilinear_cut_weight =
5375  Kokkos::create_mirror_view(Kokkos::HostSpace(),
5376  local_process_rectilinear_cut_weight);
5377  auto host_local_global_rectilinear_cut_weight =
5378  Kokkos::create_mirror_view(Kokkos::HostSpace(),
5379  local_global_rectilinear_cut_weight);
5380  Kokkos::deep_copy(host_local_process_rectilinear_cut_weight,
5381  local_process_rectilinear_cut_weight);
5382  Kokkos::deep_copy(host_local_global_rectilinear_cut_weight,
5383  local_global_rectilinear_cut_weight);
5384  Teuchos::scan<int,mj_scalar_t>(
5385  *comm, Teuchos::REDUCE_SUM,
5386  num_cuts,
5387  host_local_process_rectilinear_cut_weight.data(),
5388  host_local_global_rectilinear_cut_weight.data());
5389  Kokkos::deep_copy(local_process_rectilinear_cut_weight,
5390  host_local_process_rectilinear_cut_weight);
5391  Kokkos::deep_copy(local_global_rectilinear_cut_weight,
5392  host_local_global_rectilinear_cut_weight);
5393 
5394  Kokkos::parallel_for("finish up mj_get_new_cut_coordinates",
5395  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
5396  KOKKOS_LAMBDA(int dummy) {
5397  for(mj_part_t i = 0; i < num_cuts; ++i) {
5398  // if cut line weight to be distributed.
5399  if(local_global_rectilinear_cut_weight(i) > 0) {
5400  // expected weight to go to the left of the cut.
5401  mj_scalar_t expected_part_weight = current_part_target_weights(i);
5402  // the on-cut weight that should be put to the left of the cut.
5403  mj_scalar_t necessary_weight_on_line_for_left =
5404  expected_part_weight - current_global_part_weights(i * 2);
5405 
5406  // the weight on the cut owned by this process
5407  mj_scalar_t my_weight_on_line =
5408  local_process_rectilinear_cut_weight(i);
5409 
5410  // the sum of the cut weights up to this process,
5411  // including the weight of this process.
5412  mj_scalar_t weight_on_line_upto_process_inclusive =
5413  local_global_rectilinear_cut_weight(i);
5414  // the space on the left side of the cut after all processes
5415  // before this process (including this process)
5416  // put their weights on the cut to the left.
5417  mj_scalar_t space_to_put_left =
5418  necessary_weight_on_line_for_left -
5419  weight_on_line_upto_process_inclusive;
5420  // add my weight to this space to find out how much space
5421  // is left to me.
5422  mj_scalar_t space_left_to_me =
5423  space_to_put_left + my_weight_on_line;
5424 
5425  /*
5426  cout << "expected_part_weight:" << expected_part_weight
5427  << " necessary_weight_on_line_for_left:"
5428  << necessary_weight_on_line_for_left
5429  << " my_weight_on_line" << my_weight_on_line
5430  << " weight_on_line_upto_process_inclusive:"
5431  << weight_on_line_upto_process_inclusive
5432  << " space_to_put_left:" << space_to_put_left
5433  << " space_left_to_me" << space_left_to_me << endl;
5434  */
5435 
5436  if(space_left_to_me < 0) {
5437  // space_left_to_me is negative and I don't need to put
5438  // anything to the left.
5439  current_part_cut_line_weight_to_put_left(i) = 0;
5440  }
5441  else if(space_left_to_me >= my_weight_on_line) {
5442  // the space left to me is bigger than this processor's
5443  // weight on the cut,
5444  // so put everything to the left.
5445  current_part_cut_line_weight_to_put_left(i) =
5446  my_weight_on_line;
5447  // cout << "setting current_part_cut_line_weight_to_put_left
5448  // to my_weight_on_line:" << my_weight_on_line << endl;
5449  }
5450  else {
5451  // put only as much weight as the space allows.
5452  current_part_cut_line_weight_to_put_left(i) =
5453  space_left_to_me;
5454  // cout << "setting current_part_cut_line_weight_to_put_left
5455  // to space_left_to_me:" << space_left_to_me << endl;
5456  }
5457  }
5458  }
5459  view_rectilinear_cut_count(0) = 0;
5460  });
5461  }
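// Worked example for the split above (illustrative numbers): suppose 10
// more units of weight must go to the left of a cut
// (necessary_weight_on_line_for_left = 10), rank 0 has weight 6 on the
// cut and rank 1 has weight 8, so the inclusive prefix sums produced by
// Teuchos::scan are 6 and 14. Rank 0 computes space_left_to_me =
// (10 - 6) + 6 = 10 >= 6 and puts all 6 units to the left; rank 1
// computes (10 - 14) + 8 = 4, which is in [0, 8), and puts only 4 of its
// 8 units to the left. A later rank would get a negative value and put
// nothing. Exactly 10 units end up on the left, as required.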
5462 
5463  Kokkos::deep_copy(this->incomplete_cut_count, device_incomplete_cut_count);
5464 }
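// Illustrative sketch of the interpolation step used above (assuming
// plain linear interpolation between the bounds; the exact rule lives in
// mj_calculate_new_cut_position, and the names below are hypothetical):
//
//   #include <cmath>
//   double interpolate_cut(double lower, double upper,
//                          double lower_weight, double upper_weight,
//                          double target_weight, double eps) {
//     // degenerate bounds: bisect to avoid dividing by ~zero
//     if(std::abs(upper_weight - lower_weight) < eps ||
//        std::abs(upper - lower) < eps) {
//       return (lower + upper) / 2.0;
//     }
//     double ratio = (target_weight - lower_weight) /
//                    (upper_weight - lower_weight);
//     return lower + ratio * (upper - lower);
//   }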
5465 
5475 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5476  typename mj_part_t, typename mj_node_t>
5477 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5478  get_processor_num_points_in_parts(
5479  mj_part_t num_procs,
5480  mj_part_t num_parts,
5481  mj_gno_t *&num_points_in_all_processor_parts)
5482 {
5483  // the allocation size is num_parts * (num_procs + 1).
5484  size_t allocation_size = num_parts * (num_procs + 1);
5485 
5486  // this will be the output:
5487  // it holds how many points each processor has in each part;
5488  // the last portion is the sum over all processors of the points in each part.
5489 
5490  // allocate memory for the local num coordinates in each part.
5491  mj_gno_t *num_local_points_in_each_part_to_reduce_sum =
5492  new mj_gno_t[allocation_size];
5493 
5494  // this is the portion of the memory which will be used
5495  // in the summation to obtain the total number of points in each part.
5496  mj_gno_t *my_local_points_to_reduce_sum =
5497  num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
5498 
5499  // this is the portion of the memory where each processor stores its local counts.
5500  // this information is needed by other processors.
5501  mj_gno_t *my_local_point_counts_in_each_part =
5502  num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
5503 
5504  // initialize the array with 0's.
5505  memset(num_local_points_in_each_part_to_reduce_sum, 0,
5506  sizeof(mj_gno_t)*allocation_size);
5507 
5508  auto local_new_part_xadj = this->new_part_xadj;
5509  Kokkos::View<mj_gno_t *, typename mj_node_t::device_type> points_per_part(
5510  Kokkos::ViewAllocateWithoutInitializing("points per part"), num_parts);
5511  Kokkos::parallel_for("get vals on device",
5512  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_gno_t>
5513  (0, num_parts), KOKKOS_LAMBDA(mj_gno_t i) {
5514  points_per_part(i) =
5515  local_new_part_xadj(i) - ((i == 0) ? 0 : local_new_part_xadj(i-1));
5516  });
5517  auto host_points_per_part = Kokkos::create_mirror_view(points_per_part);
5518  Kokkos::deep_copy(host_points_per_part, points_per_part);
5519  for(int i = 0; i < num_parts; ++i) {
5520  my_local_points_to_reduce_sum[i] = host_points_per_part(i);
5521  }
5522 
5523  // copy the local counts to the last portion of the array, so that this
5524  // portion will hold the global number of points in each part after the reduction.
5525  memcpy (my_local_point_counts_in_each_part, my_local_points_to_reduce_sum,
5526  sizeof(mj_gno_t) * (num_parts) );
5527 
5528  // reduceAll operation.
5529  // the portion that belongs to a processor with index p
5530  // will start from p * num_parts.
5531  // the global number of points per part is held starting at index num_procs * num_parts.
5532  try{
5533  reduceAll<int, mj_gno_t>(
5534  *(this->comm),
5535  Teuchos::REDUCE_SUM,
5536  allocation_size,
5537  num_local_points_in_each_part_to_reduce_sum,
5538  num_points_in_all_processor_parts);
5539  }
5540  Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
5541 
5542  delete [] num_local_points_in_each_part_to_reduce_sum;
5543 }
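// Worked example of the reduction layout above: the array has
// num_parts * (num_procs + 1) entries; slice [p*num_parts, (p+1)*num_parts)
// is filled only by rank p, and the final slice is filled by every rank,
// so the element-wise REDUCE_SUM leaves each rank's counts in its slice
// and the global per-part totals in the last slice. With 2 ranks and
// 2 parts, rank 0 holding {3,1} points and rank 1 holding {0,4}:
//
//   rank 0 contributes: [3,1,  0,0,  3,1]
//   rank 1 contributes: [0,0,  0,4,  0,4]
//   after REDUCE_SUM:   [3,1,  0,4,  3,5]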
5544 
5560 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5561  typename mj_part_t, typename mj_node_t>
5562 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5563  mj_check_to_migrate(
5564  size_t migration_reduce_all_population,
5565  mj_lno_t num_coords_for_last_dim_part,
5566  mj_part_t num_procs,
5567  mj_part_t num_parts,
5568  mj_gno_t *num_points_in_all_processor_parts)
5569 {
5570  // if the reduce-all count and the population in the last dim are too high
5571  if(migration_reduce_all_population > future_reduceall_cutoff) {
5572  return true;
5573  }
5574 
5575  // if the work in a part per processor in the last dim is too low.
5576  if(num_coords_for_last_dim_part < min_work_last_dim) {
5577  return true;
5578  }
5579 
5580  // if migration is to be checked and the imbalance is too high
5581  if(this->check_migrate_avoid_migration_option == 0) {
5582  double global_imbalance = 0;
5583  // global shift to reach the sum of coordinate counts in each part.
5584  size_t global_shift = num_procs * num_parts;
5585 
5586  for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5587  for(mj_part_t i = 0; i < num_parts; ++i) {
5588  double ideal_num = num_points_in_all_processor_parts[global_shift + i]
5589  / double(num_procs);
5590 
5591  global_imbalance += std::abs(ideal_num -
5592  num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num);
5593  }
5594  }
5595  global_imbalance /= num_parts;
5596  global_imbalance /= num_procs;
5597 
5598  if(global_imbalance <= this->minimum_migration_imbalance) {
5599  return false;
5600  }
5601  else {
5602  return true;
5603  }
5604  }
5605  else {
5606  // if migration is forced
5607  return true;
5608  }
5609 }
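// Illustrative sketch of the imbalance metric above (hypothetical
// standalone helper; ignores the ideal == 0 corner case):
//
//   #include <cmath>
//   double average_relative_imbalance(const long *counts, // [proc][part]
//                                     const long *totals, // per part
//                                     int num_procs, int num_parts) {
//     double imbalance = 0;
//     for(int p = 0; p < num_procs; ++p) {
//       for(int i = 0; i < num_parts; ++i) {
//         double ideal = double(totals[i]) / num_procs;
//         imbalance += std::abs(ideal - counts[p * num_parts + i]) / ideal;
//       }
//     }
//     return imbalance / num_parts / num_procs;
//   }
//
// E.g. 2 procs, 1 part, counts {8, 2}, total 10: the ideal is 5, giving
// (3/5 + 3/5) / 1 / 2 = 0.6, so migration proceeds unless
// minimum_migration_imbalance exceeds 0.6.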
5610 
5624 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5625  typename mj_part_t, typename mj_node_t>
5626 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5627  assign_send_destinations(
5628  mj_part_t num_parts,
5629  mj_part_t *part_assignment_proc_begin_indices,
5630  mj_part_t *processor_chains_in_parts,
5631  mj_lno_t *send_count_to_each_proc,
5632  int *coordinate_destinations) {
5633 
5634  auto host_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
5635  deep_copy(host_new_part_xadj, this->new_part_xadj);
5636 
5637  auto host_new_coordinate_permutations =
5638  Kokkos::create_mirror_view(this->new_coordinate_permutations);
5639  deep_copy(host_new_coordinate_permutations, this->new_coordinate_permutations);
5640 
5641  for(mj_part_t p = 0; p < num_parts; ++p) {
5642  mj_lno_t part_begin = 0;
5643  if(p > 0) part_begin = host_new_part_xadj(p - 1);
5644  mj_lno_t part_end = host_new_part_xadj(p);
5645  // get the first processor to which this processor sends its part-p points.
5646  mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
5647  // initialize how many points I have sent to this processor.
5648  mj_lno_t num_total_send = 0;
5649  for(mj_lno_t j=part_begin; j < part_end; j++) {
5650  mj_lno_t local_ind = host_new_coordinate_permutations(j);
5651  while (num_total_send >= send_count_to_each_proc[proc_to_sent]) {
5652  // then get the next processor to send the points in part p.
5653  num_total_send = 0;
5654  // assign new processor to part_assign_begin[p]
5655  part_assignment_proc_begin_indices[p] =
5656  processor_chains_in_parts[proc_to_sent];
5657  // remove the previous processor
5658  processor_chains_in_parts[proc_to_sent] = -1;
5659  // choose the next processor as the next one to send to.
5660  proc_to_sent = part_assignment_proc_begin_indices[p];
5661  }
5662  // write the gno index to the corresponding position in sendBuf.
5663  coordinate_destinations[local_ind] = proc_to_sent;
5664  ++num_total_send;
5665  }
5666  }
5667 }
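// Illustrative sketch: part_assignment_proc_begin_indices and
// processor_chains_in_parts together form one singly linked list of
// destination procs per part, terminated by -1. Walking the chain for a
// part p looks like this (hypothetical standalone form):
//
//   void walk_chain(int p, const int *head_per_part,
//                   const int *next_per_proc) {
//     for(int proc = head_per_part[p]; proc != -1;
//         proc = next_per_proc[proc]) {
//       // visit proc: this rank sends some of its part-p points there
//     }
//   }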
5668 
5689 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5690  typename mj_part_t, typename mj_node_t>
5691 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5692  mj_assign_proc_to_parts(
5693  mj_gno_t * num_points_in_all_processor_parts,
5694  mj_part_t num_parts,
5695  mj_part_t num_procs,
5696  mj_lno_t *send_count_to_each_proc,
5697  std::vector<mj_part_t> &processor_ranks_for_subcomm,
5698  std::vector<mj_part_t> *next_future_num_parts_in_parts,
5699  mj_part_t &out_part_index,
5700  mj_part_t &output_part_numbering_begin_index,
5701  int * coordinate_destinations) {
5702  mj_gno_t *global_num_points_in_parts =
5703  num_points_in_all_processor_parts + num_procs * num_parts;
5704  mj_part_t *num_procs_assigned_to_each_part = new mj_part_t[num_parts];
5705 
5706  // boolean flag set when this process finds the part it is assigned to.
5707  bool did_i_find_my_group = false;
5708 
5709  mj_part_t num_free_procs = num_procs;
5710  mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;
5711 
5712  double max_imbalance_difference = 0;
5713  mj_part_t max_differing_part = 0;
5714 
5715  // find how many processors each part requires.
5716  for(mj_part_t i = 0; i < num_parts; i++) {
5717 
5718  // fractional number of processors required for this part.
5719  double scalar_required_proc = num_procs *
5720  (double (global_num_points_in_parts[i]) /
5721  double (this->num_global_coords));
5722 
5723  // round it to the closest integer; make sure we have at least one proc.
5724  mj_part_t required_proc =
5725  static_cast<mj_part_t> (0.5 + scalar_required_proc);
5726  if(required_proc == 0) required_proc = 1;
5727 
5728  // if assigning the required num procs creates problems for the rest
5729  // of the parts, then only assign {num_free_procs -
5730  // (minimum_num_procs_required_for_rest_of_parts)} procs to this part.
5731  if(num_free_procs -
5732  required_proc < minimum_num_procs_required_for_rest_of_parts) {
5733  required_proc = num_free_procs -
5734  (minimum_num_procs_required_for_rest_of_parts);
5735  }
5736 
5737  // reduce the free processor count
5738  num_free_procs -= required_proc;
5739 
5740  // reduce the minimum processor count required for the rest of the
5741  // parts by 1.
5742  --minimum_num_procs_required_for_rest_of_parts;
5743 
5744  // part (i) is assigned to (required_proc) processors.
5745  num_procs_assigned_to_each_part[i] = required_proc;
5746 
5747  // because of the rounding, some processors might be left unassigned.
5748  // we want to assign those processors to the part with the most imbalance.
5749  // find the part with the maximum imbalance here.
5750  double imbalance_wrt_ideal =
5751  (scalar_required_proc - required_proc) / required_proc;
5752  if(imbalance_wrt_ideal > max_imbalance_difference) {
5753  max_imbalance_difference = imbalance_wrt_ideal;
5754  max_differing_part = i;
5755  }
5756  }
5757 
5758  // assign extra processors to the part with the maximum imbalance
5759  // with respect to the ideal.
5760  if(num_free_procs > 0) {
5761  num_procs_assigned_to_each_part[max_differing_part] += num_free_procs;
5762  }
5763 
5764  // now find, for each part, the best processors with the least migration.
5765 
5766  // part_assignment_proc_begin_indices[i] holds the first processor in
5767  // the chain of processors to which data for part i is sent.
5768  mj_part_t *part_assignment_proc_begin_indices = new mj_part_t[num_parts];
5769 
5770  // the next processor to send to is found in processor_chains_in_parts,
5771  // in a linked-list manner.
5772  mj_part_t *processor_chains_in_parts = new mj_part_t [num_procs];
5773  mj_part_t *processor_part_assignments = new mj_part_t[num_procs];
5774 
5775  // initialize the assignment of each processor.
5776  // this has a linked list implementation.
5777  // the first of the processors assigned
5778  // to each part is held at part_assignment_proc_begin_indices[part].
5779  // the next processor assigned to that part is located at
5780  // processor_chains_in_parts[part_assignment_proc_begin_indices[part]];
5781  // the chain continues until the value -1 is reached.
5782  for(int i = 0; i < num_procs; ++i ) {
5783  processor_part_assignments[i] = -1;
5784  processor_chains_in_parts[i] = -1;
5785  }
5786  for(int i = 0; i < num_parts; ++i ) {
5787  part_assignment_proc_begin_indices[i] = -1;
5788  }
5789 
5790  // std::cout << "Before migration: mig type:" <<
5791  // this->migration_type << std::endl;
5792  // Allocate memory for sorting data structure.
5793  uSignedSortItem<mj_part_t, mj_gno_t, char> *
5794  sort_item_num_part_points_in_procs =
5795  new uSignedSortItem<mj_part_t, mj_gno_t, char>[num_procs];
5796 
5797  for(mj_part_t i = 0; i < num_parts; ++i) {
5798  // the algorithm tries to minimize the cost of migration by assigning
5799  // to each part the processors that already hold the most coordinates
5800  // of that part. here we might want to implement a maximum-weight
5801  // bipartite matching algorithm.
5802  for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5803  sort_item_num_part_points_in_procs[ii].id = ii;
5804  // if the processor is not assigned yet,
5805  // add its num points to the sort data structure.
5806  if(processor_part_assignments[ii] == -1) {
5807  sort_item_num_part_points_in_procs[ii].val =
5808  num_points_in_all_processor_parts[ii * num_parts + i];
5809  // indicate that the processor has positive weight.
5810  sort_item_num_part_points_in_procs[ii].signbit = 1;
5811  }
5812  else {
5813  // if the processor is already assigned, insert -nLocal - 1 so that
5814  // it won't be selected again.
5815  // the result would be the same if we simply set it to -1, but this
5816  // provides more information (used later) at no extra cost.
5817  // sort_item_num_part_points_in_procs[ii].val =
5818  // -num_points_in_all_processor_parts[ii * num_parts + i] - 1;
5819 
5820  // UPDATE: Since the above produces a warning when an unsigned type
5821  // is used, we added an extra sign bit to the sort item.
5822  // It is 1 for positives, 0 for negatives.
5823  sort_item_num_part_points_in_procs[ii].val =
5824  num_points_in_all_processor_parts[ii * num_parts + i];
5825  sort_item_num_part_points_in_procs[ii].signbit = 0;
5826  }
5827  }
5828 
5829  // sort the processors in the part.
5830  uqSignsort<mj_part_t, mj_gno_t,char>
5831  (num_procs, sort_item_num_part_points_in_procs);
5832 
5833  /*
5834  for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5835  std::cout << "ii:" << ii << " " <<
5836  sort_item_num_part_points_in_procs[ii].id <<
5837  " " << sort_item_num_part_points_in_procs[ii].val <<
5838  " " << int(sort_item_num_part_points_in_procs[ii].signbit) <<
5839  std::endl;
5840  }
5841  */
5842 
5843  mj_part_t required_proc_count = num_procs_assigned_to_each_part[i];
5844  mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
5845  mj_gno_t ideal_num_points_in_a_proc = Teuchos::as<mj_gno_t>(
5846  ceil(total_num_points_in_part / double (required_proc_count)));
5847 
5848  // start sending from the lightest of the assigned processors.
5849  mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
5850  mj_part_t next_proc_to_send_id =
5851  sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
5852  mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc -
5853  sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
5854 
5855  // find the processors that will be assigned to this part, which are
5856  // the heaviest unassigned processors.
5857  for(mj_part_t ii = num_procs - 1;
5858  ii >= num_procs - required_proc_count; --ii) {
5859  mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].id;
5860  // assign processor to part - i.
5861  processor_part_assignments[proc_id] = i;
5862  }
5863 
5864  bool did_change_sign = false;
5865  // if a processor has a negative count, reverse it.
5866  for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5867  // TODO: THE LINE BELOW PRODUCES A WARNING IF gno_t IS UNSIGNED
5868  // TODO: SEE BUG 6194
5869  if(sort_item_num_part_points_in_procs[ii].signbit == 0) {
5870  did_change_sign = true;
5871  sort_item_num_part_points_in_procs[ii].signbit = 1;
5872  }
5873  else {
5874  break;
5875  }
5876  }
5877 
5878  if(did_change_sign) {
5879  // re-sort the processors in the part, for the rest of the processors
5880  // that are not assigned.
5881  uqSignsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count,
5882  sort_item_num_part_points_in_procs);
5883  }
5884 
5885  /*
5886  for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5887  std::cout << "after resort ii:" << ii << " " <<
5888  sort_item_num_part_points_in_procs[ii].id <<
5889  " " << sort_item_num_part_points_in_procs[ii].val <<
5890  " " << int(sort_item_num_part_points_in_procs[ii].signbit ) <<
5891  std::endl;
5892  }
5893  */
5894 
5895  // check if this processor is one of the procs assigned to this part.
5896  // if it is, then get the group.
5897  if(!did_i_find_my_group) {
5898  for(mj_part_t ii = num_procs - 1; ii >=
5899  num_procs - required_proc_count; --ii) {
5900 
5901  mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].id;
5902 
5903  // add the proc to the group.
5904  processor_ranks_for_subcomm.push_back(proc_id_to_assign);
5905 
5906  if(proc_id_to_assign == this->myRank) {
5907  // if the assigned process is me, then I have found my group.
5908  did_i_find_my_group = true;
5909 
5910  // set the beginning of part i to my rank.
5911  part_assignment_proc_begin_indices[i] = this->myRank;
5912  processor_chains_in_parts[this->myRank] = -1;
5913 
5914  // set send count to myself to the number of points that I have
5915  // in part i.
5916  send_count_to_each_proc[this->myRank] =
5917  sort_item_num_part_points_in_procs[ii].val;
5918 
5919  // calculate the shift required for the
5920  // output_part_numbering_begin_index
5921  for(mj_part_t in = 0; in < i; ++in) {
5922  output_part_numbering_begin_index +=
5923  (*next_future_num_parts_in_parts)[in];
5924  }
5925  out_part_index = i;
5926  }
5927  }
5928 
5929  // if this was not my group,
5930  // clear the subcommunicator processor array.
5931  if(!did_i_find_my_group) {
5932  processor_ranks_for_subcomm.clear();
5933  }
5934  }
5935 
5936  // send points from the non-assigned processors to the assigned
5937  // processors, starting from the heaviest non-assigned processor.
5938  // TODO: we might want to play with this part; it allows more
5939  // computational imbalance but gives better communication balance.
5940  for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii) {
5941  mj_part_t nonassigned_proc_id =
5942  sort_item_num_part_points_in_procs[ii].id;
5943  mj_lno_t num_points_to_sent =
5944  sort_item_num_part_points_in_procs[ii].val;
5945 
5946  // we set the number of points to -to_sent - 1 for the assigned processors.
5947  // we reverse it here. This should not happen, as we have already
5948  // reversed them above.
5949 #ifdef MJ_DEBUG
5950  if(num_points_to_sent < 0) {
5951  std::cout << "Migration - processor assignments - for part:" << i
5952  << " from proc:" << nonassigned_proc_id << " num_points_to_sent:"
5953  << num_points_to_sent << std::endl;
5954  std::terminate();
5955  }
5956 #endif
5957 
5958  switch (migration_type) {
5959  case 0:
5960  {
5961  // now send the points to the assigned processors.
5962  while (num_points_to_sent > 0) {
5963  // if the processor has enough space.
5964  if(num_points_to_sent <= space_left_in_sent_proc) {
5965  // reduce the space left in the processor.
5966  space_left_in_sent_proc -= num_points_to_sent;
5967  // if my rank is the one that is sending the coordinates.
5968  if(this->myRank == nonassigned_proc_id) {
5969  // set my send count to the target processor.
5970  send_count_to_each_proc[next_proc_to_send_id] =
5971  num_points_to_sent;
5972  // save the processor in the list (processor_chains_in_parts
5973  // and part_assignment_proc_begin_indices)
5974  // to which the processor will send its part-i points.
5975  mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
5976  part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
5977  processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
5978  }
5979  num_points_to_sent = 0;
5980  }
5981  else {
5982  // there might be no space left in the processor.
5983  if(space_left_in_sent_proc > 0) {
5984  num_points_to_sent -= space_left_in_sent_proc;
5985 
5986  // send only as much as the space left in the processor.
5987  if(this->myRank == nonassigned_proc_id) {
5988  // send as much as the space in this case.
5989  send_count_to_each_proc[next_proc_to_send_id] =
5990  space_left_in_sent_proc;
5991  mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
5992  part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
5993  processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
5994  }
5995  }
5996  // move on to the next target processor.
5997  ++next_proc_to_send_index;
5998 
5999 #ifdef MJ_DEBUG
6000  if(next_proc_to_send_index < num_procs - required_proc_count) {
6001  std::cout << "Migration - processor assignments - for part:"
6002  << i
6003  << " next_proc_to_send_index:" << next_proc_to_send_index
6004  << " num_procs:" << num_procs
6005  << " required_proc_count:" << required_proc_count
6006  << " Error: next_proc_to_send_index <"
6007  << " num_procs - required_proc_count" << std::endl;
6008  std::terminate();
6009  }
6010 #endif
6011  // pick the id of the new processor to send to.
6012  next_proc_to_send_id =
6013  sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
6014  // set the new space in the processor.
6015  space_left_in_sent_proc = ideal_num_points_in_a_proc -
6016  sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
6017  }
6018  }
6019  }
6020  break;
6021  default:
6022  {
6023  // to minimize messages, we want each processor to send its
6024  // coordinates to only a single processor.
6025  // we do not respect imbalances here; we send all points to the
6026  // next processor.
6027  if(this->myRank == nonassigned_proc_id) {
6028  // set my send count to the target processor.
6029  send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
6030  // save the processor in the list (processor_chains_in_parts and
6031  // part_assignment_proc_begin_indices)
6032  // to which the processor will send its part-i points.
6033  mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6034  part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6035  processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6036  }
6037  num_points_to_sent = 0;
6038  ++next_proc_to_send_index;
6039 
6040  // if we made it past the heaviest processor, we round-robin
6041  // back to the beginning of the assigned range.
6042  if(next_proc_to_send_index == num_procs) {
6043  next_proc_to_send_index = num_procs - required_proc_count;
6044  }
6045  // pick the id of the new processor to send to.
6046  next_proc_to_send_id =
6047  sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
6048  // set the new space in the processor.
6049  space_left_in_sent_proc = ideal_num_points_in_a_proc -
6050  sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
6051  }
6052  }
6053  }
6054  }
6055 
6056  /*
6057  for(int i = 0; i < num_procs;++i) {
6058  std::cout << "me:" << this->myRank << " to part:" << i << " sends:" <<
6059  send_count_to_each_proc[i] << std::endl;
6060  }
6061  */
6062 
6063  this->assign_send_destinations(
6064  num_parts,
6065  part_assignment_proc_begin_indices,
6066  processor_chains_in_parts,
6067  send_count_to_each_proc,
6068  coordinate_destinations);
6069  delete [] part_assignment_proc_begin_indices;
6070  delete [] processor_chains_in_parts;
6071  delete [] processor_part_assignments;
6072  delete [] sort_item_num_part_points_in_procs;
6073  delete [] num_procs_assigned_to_each_part;
6074 }
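// Worked example for the proc-count rounding above (illustrative
// numbers): with num_procs = 4 and part loads {70, 20, 10} out of 100
// points, the fractional shares are {2.8, 0.8, 0.4}. Rounding with the
// at-least-one rule gives {3, 1, 1}, but assigning 3 to the first part
// would leave only 1 proc for the 2 remaining parts, so the cap trims it
// to 2; the final counts are {2, 1, 1}. Any procs still free afterwards
// are given to the part with the largest rounding imbalance.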
6075 
6091 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6092  typename mj_part_t, typename mj_node_t>
6093 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6094  assign_send_destinations2(
6095  mj_part_t num_parts,
6096  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
6097  int *coordinate_destinations,
6098  mj_part_t &output_part_numbering_begin_index,
6099  std::vector<mj_part_t> *next_future_num_parts_in_parts)
6100 {
6101  mj_part_t part_shift_amount = output_part_numbering_begin_index;
6102  mj_part_t previous_processor = -1;
6103 
6104  auto local_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
6105  Kokkos::deep_copy(local_new_part_xadj, this->new_part_xadj);
6106 
6107  auto local_new_coordinate_permutations =
6108  Kokkos::create_mirror_view(this->new_coordinate_permutations);
6109  Kokkos::deep_copy(local_new_coordinate_permutations,
6110  this->new_coordinate_permutations);
6111 
6112  for(mj_part_t i = 0; i < num_parts; ++i) {
6113  mj_part_t p = sort_item_part_to_proc_assignment[i].id;
6114 
6115  // assigned processors are sorted.
6116  mj_lno_t part_begin_index = 0;
6117 
6118  if(p > 0) {
6119  part_begin_index = local_new_part_xadj(p - 1);
6120  }
6121 
6122  mj_lno_t part_end_index = local_new_part_xadj(p);
6123 
6124  mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].val;
6125  if(this->myRank == assigned_proc && previous_processor != assigned_proc) {
6126  output_part_numbering_begin_index = part_shift_amount;
6127  }
6128  previous_processor = assigned_proc;
6129  part_shift_amount += (*next_future_num_parts_in_parts)[p];
6130 
6131  for(mj_lno_t j= part_begin_index; j < part_end_index; j++) {
6132  mj_lno_t localInd = local_new_coordinate_permutations(j);
6133  coordinate_destinations[localInd] = assigned_proc;
6134  }
6135  }
6136 }
6137 
6159 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6160  typename mj_part_t, typename mj_node_t>
6161 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6162  mj_assign_parts_to_procs(
6163  mj_gno_t * num_points_in_all_processor_parts,
6164  mj_part_t num_parts,
6165  mj_part_t num_procs,
6166  mj_lno_t *send_count_to_each_proc,
6167  std::vector<mj_part_t> *next_future_num_parts_in_parts,
6168  mj_part_t &out_num_part,
6169  std::vector<mj_part_t> &out_part_indices,
6170  mj_part_t &output_part_numbering_begin_index,
6171  int *coordinate_destinations) {
6172 
6173  out_num_part = 0;
6174  mj_gno_t *global_num_points_in_parts =
6175  num_points_in_all_processor_parts + num_procs * num_parts;
6176  out_part_indices.clear();
6177 
6178  // to sort the parts that are assigned to the processors.
6179  // id is the part number; the sort value is the assigned processor id.
6180  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment =
6181  new uSortItem<mj_part_t, mj_part_t>[num_parts];
6182  uSortItem<mj_part_t, mj_gno_t> * sort_item_num_points_of_proc_in_part_i =
6183  new uSortItem<mj_part_t, mj_gno_t>[num_procs];
6184 
6185  // calculate the optimal number of coordinates that should be assigned
6186  // to each processor.
6187  mj_lno_t work_each =
6188  mj_lno_t (this->num_global_coords / (double (num_procs)) + 0.5f);
6189 
6190  // to hold the space left in each proc, as the number of coordinates
6191  // below the optimal number.
6192  mj_lno_t *space_in_each_processor = new mj_lno_t[num_procs];
6193 
6194  // initialize left space in each.
6195  for(mj_part_t i = 0; i < num_procs; ++i) {
6196  space_in_each_processor[i] = work_each;
6197  }
6198 
6199  // we keep track of how many parts each processor is assigned to
6200  // because, for some weird inputs, it might be possible that some
6201  // processors are not assigned to any part. Using these variables,
6202  // we force each processor to have at least one part.
6203  mj_part_t *num_parts_proc_assigned = new mj_part_t[num_procs];
6204  memset(num_parts_proc_assigned, 0, sizeof(mj_part_t) * num_procs);
6205  int empty_proc_count = num_procs;
6206 
6207  // to sort the parts by the number of their coordinates.
6208  // ids are the part numbers; the sort value is the number of points in each.
6209  uSortItem<mj_part_t, mj_gno_t> * sort_item_point_counts_in_parts =
6210  new uSortItem<mj_part_t, mj_gno_t>[num_parts];
6211 
6212  // initially we will sort the parts according to the number of
6213  // coordinates they have, so that we will start assigning with the
6214  // part that has the most coordinates.
6215  for(mj_part_t i = 0; i < num_parts; ++i) {
6216  sort_item_point_counts_in_parts[i].id = i;
6217  sort_item_point_counts_in_parts[i].val = global_num_points_in_parts[i];
6218  }
6219 
6220  // sort parts with increasing order of loads.
6221  uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);
6222 
6223  // assigning parts to the processors
6224  // traverse the part with decreasing order of load.
6225  // first assign the heaviest part.
6226  for(mj_part_t j = 0; j < num_parts; ++j) {
6227  // sorted with increasing order, traverse inverse.
6228  mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].id;
6229 
6230  // load of the part
6231  mj_gno_t load = global_num_points_in_parts[i];
6232 
6233  // assigned processors
6234  mj_part_t assigned_proc = -1;
6235 
6236  // sort processors with increasing number of points in this part.
6237  for(mj_part_t ii = 0; ii < num_procs; ++ii) {
6238  sort_item_num_points_of_proc_in_part_i[ii].id = ii;
6239 
6240  // if there are still enough parts to fill empty processors, then
6241  // proceed normally; but if the empty processor count equals the number
6242  // of remaining parts, we restrict part assignments to empty processors.
6243  if(empty_proc_count < num_parts - j ||
6244  num_parts_proc_assigned[ii] == 0) {
6245  // how many points processor ii has in part i?
6246  sort_item_num_points_of_proc_in_part_i[ii].val =
6247  num_points_in_all_processor_parts[ii * num_parts + i];
6248  }
6249  else {
6250  sort_item_num_points_of_proc_in_part_i[ii].val = -1;
6251  }
6252  }
6253 
6254  uqsort<mj_part_t, mj_gno_t>(num_procs,
6255  sort_item_num_points_of_proc_in_part_i);
6256 
6257  // traverse all processors with decreasing load.
6258  for(mj_part_t iii = num_procs - 1; iii >= 0; --iii) {
6259  mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
6260  if(assigned_proc == -1 ||
6261  (space_in_each_processor[ii] > space_in_each_processor[assigned_proc])) {
6262  assigned_proc = ii;
6263  }
6264  else if(space_in_each_processor[ii] == space_in_each_processor[assigned_proc]) {
6265  if(ii < assigned_proc) {
6266  // ties go to the lower proc.
6267  // not necessary for a valid result but allows testing to compare
6268  // MPI results and have part numbers assigned to the same boxes.
6269  // We don't break here because we may have more ties still to check.
6270  // The indeterminate state before this is due to Cuda using
6271  // atomics to refill the permutation array. So non-Cuda runs don't
6272  // actually need this since they will always have the same pattern.
6273  assigned_proc = ii;
6274  }
6275  }
6276  else {
6277  break; // now we can break - we have our part and no more ties.
6278  }
6279  }
6280 
6281  if(num_parts_proc_assigned[assigned_proc]++ == 0) {
6282  --empty_proc_count;
6283  }
6284 
6285  space_in_each_processor[assigned_proc] -= load;
6286  // to sort later: part i is assigned to processor assigned_proc.
6287  sort_item_part_to_proc_assignment[j].id = i; //part i
6288 
6289  // assigned to processor - assignment.
6290  sort_item_part_to_proc_assignment[j].val = assigned_proc;
6291 
6292  // if the assigned processor is me, increase my part count.
6293  if(assigned_proc == this->myRank) {
6294  out_num_part++;//assigned_part_count;
6295  out_part_indices.push_back(i);
6296  }
6297 
6298  // increase the send count to that processor by the number of points in
6299  // that part, as everyone sends their coordinates in this part to the
6300  // processor assigned to this part.
6301  send_count_to_each_proc[assigned_proc] +=
6302  num_points_in_all_processor_parts[this->myRank * num_parts + i];
6303  }
6304 
6305  delete [] num_parts_proc_assigned;
6306  delete [] sort_item_num_points_of_proc_in_part_i;
6307  delete [] sort_item_point_counts_in_parts;
6308  delete [] space_in_each_processor;
6309 
6310  // sort assignments with respect to the assigned processors.
6311  uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);
6312 
6313  // fill sendBuf.
6314  this->assign_send_destinations2(
6315  num_parts,
6316  sort_item_part_to_proc_assignment,
6317  coordinate_destinations,
6318  output_part_numbering_begin_index,
6319  next_future_num_parts_in_parts);
6320 
6321  delete [] sort_item_part_to_proc_assignment;
6322 }
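// Worked example for the greedy assignment above (illustrative numbers):
// two parts {A: 6 points, B: 4 points} and two procs, each with capacity
// work_each = 5. A is visited first (heaviest); both procs still have
// space 5, so the tie resolves to the lower rank and A lands on proc 0,
// whose space drops to -1. B is visited next; proc 1 now has more space
// left (5 > -1), so B lands on proc 1. Resolving exact-space ties to the
// lower rank is what keeps CUDA and non-CUDA runs producing the same
// part-to-proc map.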
6323 
6324 
6348 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6349  typename mj_part_t, typename mj_node_t>
6350 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6351  mj_migration_part_proc_assignment(
6352  mj_gno_t * num_points_in_all_processor_parts,
6353  mj_part_t num_parts,
6354  mj_part_t num_procs,
6355  mj_lno_t *send_count_to_each_proc,
6356  std::vector<mj_part_t> &processor_ranks_for_subcomm,
6357  std::vector<mj_part_t> *next_future_num_parts_in_parts,
6358  mj_part_t &out_num_part,
6359  std::vector<mj_part_t> &out_part_indices,
6360  mj_part_t &output_part_numbering_begin_index,
6361  int *coordinate_destinations)
6362 {
6363  processor_ranks_for_subcomm.clear();
6364  // if(this->num_local_coords > 0)
6365  if(num_procs > num_parts) {
6366  // if there are more processors than the number of current parts,
6367  // then processors share the existing parts.
6368  // at the end each processor will have a single part,
6369  // but a part will be shared by a group of processors.
6370  mj_part_t out_part_index = 0;
6371 
6372  this->mj_assign_proc_to_parts(
6373  num_points_in_all_processor_parts,
6374  num_parts,
6375  num_procs,
6376  send_count_to_each_proc,
6377  processor_ranks_for_subcomm,
6378  next_future_num_parts_in_parts,
6379  out_part_index,
6380  output_part_numbering_begin_index,
6381  coordinate_destinations
6382  );
6383 
6384  out_num_part = 1;
6385  out_part_indices.clear();
6386  out_part_indices.push_back(out_part_index);
6387  }
6388  else {
6389 
6390  // there are more parts than the processors.
6391  // therefore a processor will be assigned multiple parts,
6392  // the subcommunicators will only have a single processor.
6393  processor_ranks_for_subcomm.push_back(this->myRank);
6394 
6395  // since there are more parts than procs,
6396  // assign multiple parts to processors.
6397 
6398  this->mj_assign_parts_to_procs(
6399  num_points_in_all_processor_parts,
6400  num_parts,
6401  num_procs,
6402  send_count_to_each_proc,
6403  next_future_num_parts_in_parts,
6404  out_num_part,
6405  out_part_indices,
6406  output_part_numbering_begin_index,
6407  coordinate_destinations);
6408  }
6409 }
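// Illustrative contrast of the two regimes above: with 8 procs and
// 2 parts, mj_assign_proc_to_parts splits the procs into two
// subcommunicators of 4 ranks, each group sharing one part; with 2 procs
// and 8 parts, mj_assign_parts_to_procs gives each proc roughly 4 parts
// and every subcommunicator is just the rank itself.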
6410 
6424 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6425  typename mj_part_t, typename mj_node_t>
6426 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6427  mj_migrate_coords(
6428  mj_part_t num_procs,
6429  mj_lno_t &num_new_local_points,
6430  std::string iteration,
6431  int *coordinate_destinations,
6432  mj_part_t num_parts)
6433 {
6434 
6435 #ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
6436  if(sizeof(mj_lno_t) <= sizeof(int)) {
6437  // Cannot use Zoltan_Comm with local ordinals larger than ints.
6438  // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
6439  // may overflow.
6440  ZOLTAN_COMM_OBJ *plan = NULL;
6441  MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->comm));
6442  int num_incoming_gnos = 0;
6443  int message_tag = 7859;
6444 
6445  this->mj_env->timerStart(MACRO_TIMERS,
6446  mj_timer_base_string + "Migration Z1PlanCreating-" + iteration);
6447  int ierr = Zoltan_Comm_Create(
6448  &plan,
6449  int(this->num_local_coords),
6450  coordinate_destinations,
6451  mpi_comm,
6452  message_tag,
6453  &num_incoming_gnos);
6454 
6455  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6456  this->mj_env->timerStop(MACRO_TIMERS,
6457  mj_timer_base_string + "Migration Z1PlanCreating-" + iteration);
6458 
6459  this->mj_env->timerStart(MACRO_TIMERS,
6460  mj_timer_base_string + "Migration Z1Migration-" + iteration);
6461 
6462  // MPI Buffers should be on Kokkos::HostSpace not Kokkos::CudaUVMSpace
6463  // Note, with UVM space, create_mirror_view does NOT create a non-UVM
6464  // view; need the explicit Host creation and deep_copy.
6465 
6466  // migrate gnos.
6467  {
6468  auto host_current_mj_gnos = Kokkos::create_mirror_view(
6469  Kokkos::HostSpace(), this->current_mj_gnos);
6470  Kokkos::deep_copy(host_current_mj_gnos, this->current_mj_gnos);
6471  Kokkos::View<mj_gno_t*, device_t> dst_gnos(
6472  Kokkos::ViewAllocateWithoutInitializing("dst_gnos"), num_incoming_gnos);
6473  auto host_dst_gnos = Kokkos::create_mirror_view(
6474  Kokkos::HostSpace(), dst_gnos);
6475  message_tag++;
6476  ierr = Zoltan_Comm_Do(
6477  plan,
6478  message_tag,
6479  (char *) host_current_mj_gnos.data(),
6480  sizeof(mj_gno_t),
6481  (char *) host_dst_gnos.data());
6482  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6483  Kokkos::deep_copy(dst_gnos, host_dst_gnos);
6484  this->current_mj_gnos = dst_gnos;
6485  }
6486 
6487  //migrate coordinates
6488  {
6489  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
6490  auto host_src_coordinates = Kokkos::create_mirror_view(
6491  Kokkos::HostSpace(), this->mj_coordinates);
6492  Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6493  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6494  dst_coordinates(Kokkos::ViewAllocateWithoutInitializing("mj_coordinates"),
6495  num_incoming_gnos, this->coord_dim);
6496  auto host_dst_coordinates = Kokkos::create_mirror_view(
6497  Kokkos::HostSpace(), dst_coordinates);
6498  for(int i = 0; i < this->coord_dim; ++i) {
6499  Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sub_host_src_coordinates
6500  = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6501  Kokkos::View<mj_scalar_t *, Kokkos::HostSpace> sub_host_dst_coordinates
6502  = Kokkos::subview(host_dst_coordinates, Kokkos::ALL, i);
6503  // Note Layout Left means we can do these in contiguous blocks
6504  message_tag++;
6505  ierr = Zoltan_Comm_Do(
6506  plan,
6507  message_tag,
6508  (char *) sub_host_src_coordinates.data(),
6509  sizeof(mj_scalar_t),
6510  (char *) sub_host_dst_coordinates.data());
6511  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6512  }
6513  deep_copy(dst_coordinates, host_dst_coordinates);
6514  this->mj_coordinates = dst_coordinates;
6515  }
6516 
6517  // migrate weights.
6518  {
6519  auto host_src_weights = Kokkos::create_mirror_view(
6520  Kokkos::HostSpace(), this->mj_weights);
6521  Kokkos::deep_copy(host_src_weights, this->mj_weights);
6522  Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6523  Kokkos::ViewAllocateWithoutInitializing("mj_weights"),
6524  num_incoming_gnos, this->num_weights_per_coord);
6525  auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
6526  for(int i = 0; i < this->num_weights_per_coord; ++i) {
6527  auto sub_host_src_weights
6528  = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6529  auto sub_host_dst_weights
6530  = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6531  ArrayRCP<mj_scalar_t> sent_weight(this->num_local_coords);
6532  // Copy because of layout
6533  for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6534  sent_weight[n] = sub_host_src_weights(n);
6535  }
6536  ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
6537  message_tag++;
6538  ierr = Zoltan_Comm_Do(
6539  plan,
6540  message_tag,
6541  (char *) sent_weight.getRawPtr(),
6542  sizeof(mj_scalar_t),
6543  (char *) received_weight.getRawPtr());
6544  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6545  // Again we copy by index due to layout
6546  for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6547  sub_host_dst_weights(n) = received_weight[n];
6548  }
6549  }
6550  deep_copy(dst_weights, host_dst_weights);
6551  this->mj_weights = dst_weights;
6552  }
6553 
6554  // migrate owners.
6555  {
6556  // Note that we keep the owners on host (Serial) space.
6557  Kokkos::View<int *, Kokkos::HostSpace> dst_owners_of_coordinate(
6558  Kokkos::ViewAllocateWithoutInitializing("owner_of_coordinate"),
6559  num_incoming_gnos);
6560  message_tag++;
6561  ierr = Zoltan_Comm_Do(
6562  plan,
6563  message_tag,
6564  (char *) owner_of_coordinate.data(),
6565  sizeof(int),
6566  (char *) dst_owners_of_coordinate.data());
6567  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6568  this->owner_of_coordinate = dst_owners_of_coordinate;
6569  }
6570 
6571  // if num procs is less than num parts,
6572  // we need the part assignment arrays as well, since
6573  // there will be multiple parts per processor.
6574  {
6575  auto host_src_assigned_part_ids = Kokkos::create_mirror_view(
6576  Kokkos::HostSpace(), this->assigned_part_ids);
6577  Kokkos::deep_copy(host_src_assigned_part_ids, this->assigned_part_ids);
6578  Kokkos::View<int *, device_t> dst_assigned_part_ids(
6579  Kokkos::ViewAllocateWithoutInitializing("assigned_part_ids"),
6580  num_incoming_gnos);
6581  auto host_dst_assigned_part_ids = Kokkos::create_mirror_view(
6582  Kokkos::HostSpace(), dst_assigned_part_ids);
6583  mj_part_t *new_parts = new mj_part_t[num_incoming_gnos];
6584  if(num_procs < num_parts) {
6585  message_tag++;
6586  ierr = Zoltan_Comm_Do(
6587  plan,
6588  message_tag,
6589  (char *) host_src_assigned_part_ids.data(),
6590  sizeof(mj_part_t),
6591  (char *) host_dst_assigned_part_ids.data());
6592  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6593  Kokkos::deep_copy(dst_assigned_part_ids, host_dst_assigned_part_ids);
6594  }
6595  // In the original code this would just assign an uninitialized array
6596  // when num_procs >= num_parts. We preserve that behavior here.
6597  this->assigned_part_ids = dst_assigned_part_ids;
6598  }
6599 
6600  ierr = Zoltan_Comm_Destroy(&plan);
6601  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6602  num_new_local_points = num_incoming_gnos;
6603  this->mj_env->timerStop(MACRO_TIMERS,
6604  mj_timer_base_string + "Migration Z1Migration-" + iteration);
6605  }
6606  else
6607 #endif // ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
6608  {
6609  this->mj_env->timerStart(MACRO_TIMERS, mj_timer_base_string +
6610  "Migration DistributorPlanCreating-" + iteration);
6611 
6612  Tpetra::Distributor distributor(this->comm);
6613  ArrayView<const mj_part_t> destinations( coordinate_destinations,
6614  this->num_local_coords);
6615  mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
6616  this->mj_env->timerStop(MACRO_TIMERS, mj_timer_base_string +
6617  "Migration DistributorPlanCreating-" + iteration);
6618  this->mj_env->timerStart(MACRO_TIMERS, mj_timer_base_string +
6619  "Migration DistributorMigration-" + iteration);
6620 
6621  // note MPI buffers should all be on Kokkos::HostSpace and not
6622  // Kokkos::CudaUVMSpace.
6623  // Note, with UVM space, create_mirror_view does NOT create a non-UVM
6624  // view; need the explicit Host creation and deep_copy.
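// A minimal sketch of the buffer pattern used below, for a device view
// 'dev' of mj_gno_t (names hypothetical):
//   Kokkos::View<mj_gno_t*, Kokkos::HostSpace> host_buf(
//     Kokkos::ViewAllocateWithoutInitializing("host_buf"), dev.extent(0));
//   Kokkos::deep_copy(host_buf, dev); // always lands in plain host memory
// whereas create_mirror_view(dev) may alias UVM memory and hand a CUDA
// allocation to MPI.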
6625  // migrate gnos.
6626  {
6627  Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
6628  Kokkos::ViewAllocateWithoutInitializing("received_gnos"),
6629  num_incoming_gnos);
6630 
6631  Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
6632  Kokkos::ViewAllocateWithoutInitializing("sent_gnos"),
6633  this->current_mj_gnos.extent(0));
6634  Kokkos::deep_copy(sent_gnos, this->current_mj_gnos);
6635 
6636  distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
6637 
6638  this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
6639  Kokkos::ViewAllocateWithoutInitializing("gids"), num_incoming_gnos);
6640 
6641  Kokkos::deep_copy(this->current_mj_gnos, received_gnos);
6642  }
6643 
6644  // migrate coordinates
6645  // coordinates in MJ are LayoutLeft since Tpetra Multivector is LayoutLeft
6646  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6647  dst_coordinates("mj_coordinates", num_incoming_gnos, this->coord_dim);
6648 
6649  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>
6650  host_src_coordinates(
6651  Kokkos::ViewAllocateWithoutInitializing("host_coords"),
6652  this->mj_coordinates.extent(0), this->mj_coordinates.extent(1));
6653  Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6654 
6655  Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_coord(
6656  Kokkos::ViewAllocateWithoutInitializing("received_coord"),
6657  num_incoming_gnos);
6658 
6659  for(int i = 0; i < this->coord_dim; ++i) {
6660 
6661  // Note Layout Left means we can do these in contiguous blocks
6662 
6663  Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_coord
6664  = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6665 
6666  distributor.doPostsAndWaits(sent_coord, 1, received_coord);
6667 
6668  Kokkos::deep_copy(Kokkos::subview(dst_coordinates, Kokkos::ALL, i),
6669  received_coord);
6670 
6671  // Kokkos::deep_copy will fence, I think, so it should be safe
6672  // to reuse received_coord in the next loop iteration
6673  }
6674  this->mj_coordinates = dst_coordinates;
6675 
6676  // migrate weights.
6677  Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6678  "mj_weights", num_incoming_gnos, this->num_weights_per_coord);
6679  auto host_dst_weights = Kokkos::create_mirror_view(Kokkos::HostSpace(),
6680  dst_weights);
6681 
6682  auto host_src_weights = Kokkos::create_mirror_view_and_copy(
6683  Kokkos::HostSpace(), this->mj_weights);
6684 
6685  // contiguous buffers to gather potentially strided data
6686  Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_weight(
6687  Kokkos::ViewAllocateWithoutInitializing("send_weight_buffer"),
6688  this->num_local_coords);
6689 
6690  Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_weight(
6691  Kokkos::ViewAllocateWithoutInitializing("received_weight_buffer"),
6692  num_incoming_gnos);
6693 
6694  for(int i = 0; i < this->num_weights_per_coord; ++i) {
6695 
6696  auto sub_host_src_weights
6697  = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6698 
6699  auto sub_host_dst_weights
6700  = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6701 
6702 
6703  // LayoutRight means the weights are not contiguous.
6704  // However, we don't have any systems set up with more than 1 weight,
6705  // so this code is untested with num weights > 1.
6706  // I think this is the right thing to do.
6707  for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6708  sent_weight[n] = sub_host_src_weights(n);
6709  }
6710 
6711  distributor.doPostsAndWaits(sent_weight, 1, received_weight);
6712 
6713  // Again we copy by index due to layout
6714  for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6715  sub_host_dst_weights(n) = received_weight[n];
6716  }
6717  }
6718  Kokkos::deep_copy(dst_weights, host_dst_weights);
6719  this->mj_weights = dst_weights;
6720 
6721  // migrate owners
6722  {
6723  // Note that owner_of_coordinate was kept on the host (Serial).
6724  Kokkos::View<int *, Kokkos::HostSpace> received_owners(
6725  Kokkos::ViewAllocateWithoutInitializing("owner_of_coordinate"),
6726  num_incoming_gnos);
6727 
6728  distributor.doPostsAndWaits(owner_of_coordinate, 1, received_owners);
6729 
6730  this->owner_of_coordinate = received_owners;
6731  }
6732 
6733  // if num procs is less than num parts,
6734  // we need the part assignment arrays as well, since
6735  // there will be multiple parts per processor.
6736  if(num_procs < num_parts) {
6737  Kokkos::View<mj_part_t*, Kokkos::HostSpace> sent_partids(
6738  Kokkos::ViewAllocateWithoutInitializing("host_parts"),
6739  this->assigned_part_ids.extent(0));
6740  Kokkos::deep_copy(sent_partids, assigned_part_ids);
6741 
6742  Kokkos::View<mj_part_t*, Kokkos::HostSpace> received_partids(
6743  Kokkos::ViewAllocateWithoutInitializing("received_partids"),
6744  num_incoming_gnos);
6745 
6746  distributor.doPostsAndWaits(sent_partids, 1, received_partids);
6747 
6748  this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6749  ("assigned_part_ids", num_incoming_gnos);
6750  Kokkos::deep_copy(this->assigned_part_ids, received_partids);
6751  }
6752  else {
6753  this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6754  ("assigned_part_ids", num_incoming_gnos);
6755  }
6756  this->mj_env->timerStop(MACRO_TIMERS, mj_timer_base_string +
6757  "Migration DistributorMigration-" + iteration);
6758 
6759  num_new_local_points = num_incoming_gnos;
6760  }
6761 }
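// For reference, a self-contained sketch of the Tpetra::Distributor
// exchange pattern used above (all names and sizes hypothetical):
//   Tpetra::Distributor dist(comm);
//   Teuchos::ArrayView<const int> dests(dest_rank_per_item, num_local);
//   const size_t num_incoming = dist.createFromSends(dests);
//   Kokkos::View<double*, Kokkos::HostSpace> sent("sent", num_local);
//   Kokkos::View<double*, Kokkos::HostSpace> received("recv", num_incoming);
//   dist.doPostsAndWaits(sent, 1, received); // 1 value per item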
6762 
6768 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6769  typename mj_part_t, typename mj_node_t>
6770 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6771  create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm)
6772 {
6773  mj_part_t group_size = processor_ranks_for_subcomm.size();
6774  mj_part_t *ids = new mj_part_t[group_size];
6775  for(mj_part_t i = 0; i < group_size; ++i) {
6776  ids[i] = processor_ranks_for_subcomm[i];
6777  }
6778  ArrayView<const mj_part_t> idView(ids, group_size);
6779  this->comm = this->comm->createSubcommunicator(idView);
6780  delete [] ids;
6781 }
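// Note: createSubcommunicator keeps only the listed ranks; e.g., passing
// ranks {0, 2, 5} of an 8-rank comm yields a 3-rank subcommunicator on
// those processes (and, as far as I know, a null comm on excluded ranks).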
6782 
6788 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6789  typename mj_part_t, typename mj_node_t>
6790 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6791  fill_permutation_array(
6792  mj_part_t output_num_parts,
6793  mj_part_t num_parts)
6794 {
6795  // if there is single output part, then simply fill the permutation array.
6796  if(output_num_parts == 1) {
6797  auto local_new_coordinate_permutations = this->new_coordinate_permutations;
6798  Kokkos::parallel_for(
6799  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
6800  (0, this->num_local_coords),
6801  KOKKOS_LAMBDA(mj_lno_t i) {
6802  local_new_coordinate_permutations(i) = i;
6803  });
6804  auto local_new_part_xadj = this->new_part_xadj;
6805  auto local_num_local_coords = this->num_local_coords;
6806  Kokkos::parallel_for(
6807  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
6808  KOKKOS_LAMBDA(int dummy) {
6809  local_new_part_xadj(0) = local_num_local_coords;
6810  });
6811  }
6812  else {
6813  auto local_num_local_coords = this->num_local_coords;
6814  auto local_assigned_part_ids = this->assigned_part_ids;
6815  auto local_new_part_xadj = this->new_part_xadj;
6816  auto local_new_coordinate_permutations = this->new_coordinate_permutations;
6817 
6818  // part_shifts holds which new part number an old part number corresponds to.
6819  Kokkos::View<mj_part_t*, device_t> part_shifts("part_shifts", num_parts);
6820 
6821  // otherwise we need to count how many points there are in each part.
6822  // we allocate num_parts entries here, because the sent part ids go up
6823  // to num_parts, although there are only output_num_parts different parts.
6824  Kokkos::View<mj_lno_t*, device_t> num_points_in_parts(
6825  "num_points_in_parts", num_parts);
6826 
6827  Kokkos::parallel_for(
6828  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
6829  KOKKOS_LAMBDA(int dummy) {
6830 
6831  for(mj_lno_t i = 0; i < local_num_local_coords; ++i) {
6832  mj_part_t ii = local_assigned_part_ids(i);
6833  ++num_points_in_parts(ii);
6834  }
6835 
6836  // write the end points of the parts.
6837  mj_part_t p = 0;
6838  mj_lno_t prev_index = 0;
6839  for(mj_part_t i = 0; i < num_parts; ++i) {
6840  if(num_points_in_parts(i) > 0) {
6841  local_new_part_xadj(p) = prev_index + num_points_in_parts(i);
6842  prev_index += num_points_in_parts(i);
6843  part_shifts(i) = p++;
6844  }
6845  }
6846 
6847  // for the rest of the parts write the end index as end point.
6848  mj_part_t assigned_num_parts = p - 1;
6849  for(;p < num_parts; ++p) {
6850  local_new_part_xadj(p) =
6851  local_new_part_xadj(assigned_num_parts);
6852  }
6853  for(mj_part_t i = 0; i < output_num_parts; ++i) {
6854  num_points_in_parts(i) = local_new_part_xadj(i);
6855  }
6856 
6857  // write the permutation array here.
6858  // get the part of coordinate i and shift it to obtain the new part number,
6859  // then place i at the current end of that part's range.
6860  for(mj_lno_t i = local_num_local_coords - 1; i >= 0; --i) {
6861  mj_part_t part =
6862  part_shifts[mj_part_t(local_assigned_part_ids(i))];
6863  local_new_coordinate_permutations(--num_points_in_parts[part]) = i;
6864  }
6865  });
6866  }
6867 }
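// Worked example (hypothetical input): with num_parts = 3 and
// assigned_part_ids = {2, 0, 2, 1}, the counts are {1, 1, 2}, giving
// new_part_xadj = {1, 2, 4} and part_shifts = {0, 1, 2}. The reverse
// sweep then fills new_coordinate_permutations = {1, 3, 0, 2}: part 0
// owns coordinate {1}, part 1 owns {3}, part 2 owns {0, 2}.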
6868 
6893 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6894  typename mj_part_t, typename mj_node_t>
6895 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6896  mj_perform_migration(
6897  mj_part_t input_num_parts,
6898  mj_part_t &output_num_parts,
6899  std::vector<mj_part_t> *next_future_num_parts_in_parts,
6900  mj_part_t &output_part_begin_index,
6901  size_t migration_reduce_all_population,
6902  mj_lno_t num_coords_for_last_dim_part,
6903  std::string iteration,
6904  RCP<mj_partBoxVector_t> &input_part_boxes,
6905  RCP<mj_partBoxVector_t> &output_part_boxes)
6906 {
6907  mj_part_t num_procs = this->comm->getSize();
6908  this->myRank = this->comm->getRank();
6909 
6910  // this array holds how many points each processor has in each part.
6911  // to access how many points processor i has on part j,
6912  // num_points_in_all_processor_parts[i * num_parts + j]
6913  mj_gno_t *num_points_in_all_processor_parts =
6914  new mj_gno_t[input_num_parts * (num_procs + 1)];
6915 
6916  // get the number of coordinates in each part in each processor.
6917  this->get_processor_num_points_in_parts(
6918  num_procs,
6919  input_num_parts,
6920  num_points_in_all_processor_parts);
6921 
6922  // check if migration will be performed or not.
6923  if(!this->mj_check_to_migrate(
6924  migration_reduce_all_population,
6925  num_coords_for_last_dim_part,
6926  num_procs,
6927  input_num_parts,
6928  num_points_in_all_processor_parts)) {
6929  delete [] num_points_in_all_processor_parts;
6930  return false;
6931  }
6932 
6933  mj_lno_t *send_count_to_each_proc = NULL;
6934  int *coordinate_destinations = new int[this->num_local_coords];
6935  send_count_to_each_proc = new mj_lno_t[num_procs];
6936 
6937  for(int i = 0; i < num_procs; ++i) {
6938  send_count_to_each_proc[i] = 0;
6939  }
6940 
6941  std::vector<mj_part_t> processor_ranks_for_subcomm;
6942  std::vector<mj_part_t> out_part_indices;
6943 
6944  // determine which processors are assigned to which parts
6945  this->mj_migration_part_proc_assignment(
6946  num_points_in_all_processor_parts,
6947  input_num_parts,
6948  num_procs,
6949  send_count_to_each_proc,
6950  processor_ranks_for_subcomm,
6951  next_future_num_parts_in_parts,
6952  output_num_parts,
6953  out_part_indices,
6954  output_part_begin_index,
6955  coordinate_destinations);
6956 
6957  delete [] send_count_to_each_proc;
6958  std::vector <mj_part_t> tmpv;
6959 
6960  std::sort (out_part_indices.begin(), out_part_indices.end());
6961  mj_part_t outP = out_part_indices.size();
6962  mj_gno_t new_global_num_points = 0;
6963  mj_gno_t *global_num_points_in_parts =
6964  num_points_in_all_processor_parts + num_procs * input_num_parts;
6965 
6966  if(this->mj_keep_part_boxes) {
6967  input_part_boxes->clear();
6968  }
6969 
6970  // now we calculate the new values for next_future_num_parts_in_parts.
6971  // same for the part boxes.
6972  for(mj_part_t i = 0; i < outP; ++i) {
6973  mj_part_t ind = out_part_indices[i];
6974  new_global_num_points += global_num_points_in_parts[ind];
6975  tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
6976  if(this->mj_keep_part_boxes) {
6977  input_part_boxes->push_back((*output_part_boxes)[ind]);
6978  }
6979  }
6980 
6981  // swap the input and output part boxes.
6982  if(this->mj_keep_part_boxes) {
6983  RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
6984  input_part_boxes = output_part_boxes;
6985  output_part_boxes = tmpPartBoxes;
6986  }
6987  next_future_num_parts_in_parts->clear();
6988  for(mj_part_t i = 0; i < outP; ++i) {
6989  mj_part_t p = tmpv[i];
6990  next_future_num_parts_in_parts->push_back(p);
6991  }
6992 
6993  delete [] num_points_in_all_processor_parts;
6994 
6995  mj_lno_t num_new_local_points = 0;
6996  //perform the actual migration operation here.
6997  this->mj_migrate_coords(
6998  num_procs,
6999  num_new_local_points,
7000  iteration,
7001  coordinate_destinations,
7002  input_num_parts);
7003 
7004  delete [] coordinate_destinations;
7005  if(this->num_local_coords != num_new_local_points) {
7006  this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
7007  (Kokkos::ViewAllocateWithoutInitializing("new_coordinate_permutations"),
7008  num_new_local_points);
7009  this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
7010  (Kokkos::ViewAllocateWithoutInitializing("coordinate_permutations"),
7011  num_new_local_points);
7012  }
7013  this->num_local_coords = num_new_local_points;
7014  this->num_global_coords = new_global_num_points;
7015 
7016  // create subcommunicator.
7017  this->create_sub_communicator(processor_ranks_for_subcomm);
7018 
7019  processor_ranks_for_subcomm.clear();
7020 
7021  // fill the new permutation arrays.
7022  this->fill_permutation_array(output_num_parts, input_num_parts);
7023 
7024  return true;
7025 }
7026 
7045 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7046  typename mj_part_t, typename mj_node_t>
7047 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7048  create_consistent_chunks(
7049  mj_part_t num_parts,
7050  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
7051  Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
7052  mj_lno_t coordinate_begin,
7053  mj_lno_t coordinate_end,
7054  Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
7055  Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
7056  int coordInd,
7057  bool longest_dim_part,
7058  uSignedSortItem<int, mj_scalar_t, char> * p_coord_dimension_range_sorted)
7059 {
7060  // Note that this method is only used by task mapper
7061  // All code in this file has been verified to run with UVM off by running
7062  // mj tests and task mapper tests with UVM off. However for this particular
7063  // method I did not do much for UVM off. I heavily use device to host copies
7064  // and more or less preserve the original logic. Due to the handling of
7065  // arrays it will be a bit of work to convert this to a better form.
7066  // Since it's only relevant to task mapper and I wasn't sure how much priority
7067  // to give it, I put that on hold until further discussion.
7068  mj_part_t no_cuts = num_parts - 1;
7069 
7070  // now if the rectilinear partitioning is allowed we decide how
7071  // much weight each thread should put to left and right.
7072  if(this->distribute_points_on_cut_lines) {
7073  auto local_thread_cut_line_weight_to_put_left =
7074  this->thread_cut_line_weight_to_put_left;
7075  auto local_thread_part_weight_work =
7076  this->thread_part_weight_work;
7077  auto local_sEpsilon = this->sEpsilon;
7078 
7079  Kokkos::parallel_for(
7080  Kokkos::RangePolicy<typename mj_node_t::execution_space,
7081  mj_part_t> (0, no_cuts), KOKKOS_LAMBDA (mj_part_t i) {
7082  // the left to be put on the left of the cut.
7083  mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
7084  if(left_weight > local_sEpsilon) {
7085  // the weight of thread ii on cut.
7086  mj_scalar_t thread_ii_weight_on_cut =
7087  local_thread_part_weight_work(i * 2 + 1) -
7088  local_thread_part_weight_work(i * 2);
7089  if(thread_ii_weight_on_cut < left_weight) {
7090  local_thread_cut_line_weight_to_put_left(i) =
7091  thread_ii_weight_on_cut;
7092  }
7093  else {
7094  local_thread_cut_line_weight_to_put_left(i) = left_weight;
7095  }
7096  }
7097  else {
7098  local_thread_cut_line_weight_to_put_left(i) = 0;
7099  }
7100  });
7101 
7102  if(no_cuts > 0) {
7103  auto local_least_signifiance = least_signifiance;
7104  auto local_significance_mul = significance_mul;
7105  Kokkos::parallel_for(
7106  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
7107  (0, 1), KOKKOS_LAMBDA (int dummy) {
7108  // this is a special case. If cutlines share the same coordinate,
7109  // their weights are equal.
7110  // we need to adjust the ratio for that.
7111  for(mj_part_t i = no_cuts - 1; i > 0 ; --i) {
7112  mj_scalar_t cut1 = current_concurrent_cut_coordinate(i-1);
7113  mj_scalar_t cut2 = current_concurrent_cut_coordinate(i);
7114  mj_scalar_t delta = cut2 - cut1;
7115  mj_scalar_t abs_delta = (delta > 0) ? delta : -delta;
7116  if(abs_delta < local_sEpsilon) {
7117  local_thread_cut_line_weight_to_put_left(i) -=
7118  local_thread_cut_line_weight_to_put_left(i - 1);
7119  }
7120  local_thread_cut_line_weight_to_put_left(i) =
7121  static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
7122  local_least_signifiance) * local_significance_mul) /
7123  static_cast<mj_scalar_t>(local_significance_mul);
7124  }
7125  });
7126  }
7127  }
7128 
7129  auto local_thread_point_counts = this->thread_point_counts;
7130  Kokkos::parallel_for(
7131  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
7132  (0, num_parts), KOKKOS_LAMBDA (mj_part_t i) {
7133  local_thread_point_counts(i) = 0;
7134  });
7135 
7136  // for this specific case we don't want to distribute the points along
7137  // the cut position randomly, as we need a specific ordering of them.
7138  // Instead, we put the coordinates into sort items, which we sort
7139  // using the points' coordinates in the other dimensions and their indices.
7140 
7141  // some of the cuts might share the same position.
7142  // in this case, if cut i and cut j share the same position
7143  // cut_map[i] = cut_map[j] = sort item index.
7144  mj_part_t *cut_map = new mj_part_t[no_cuts];
7145 
7146  typedef uMultiSortItem<mj_lno_t, int, mj_scalar_t> multiSItem;
7147  typedef std::vector< multiSItem > multiSVector;
7148  typedef std::vector<multiSVector> multiS2Vector;
7149 
7150  // to keep track of the memory allocated.
7151  std::vector<mj_scalar_t *>allocated_memory;
7152 
7153  // vector for which the coordinates will be sorted.
7154  multiS2Vector sort_vector_points_on_cut;
7155 
7156  // the number of cuts that have different coordinates.
7157  mj_part_t different_cut_count = 1;
7158  cut_map[0] = 0;
7159 
7160  // now we insert one sort vector for each distinct cut position.
7161  // if multiple cuts are at the same position,
7162  // they share sort vectors.
7163  multiSVector tmpMultiSVector;
7164  sort_vector_points_on_cut.push_back(tmpMultiSVector);
7165 
7166  auto local_current_concurrent_cut_coordinate =
7167  current_concurrent_cut_coordinate;
7168  auto host_current_concurrent_cut_coordinate =
7169  Kokkos::create_mirror_view(local_current_concurrent_cut_coordinate);
7170  Kokkos::deep_copy(host_current_concurrent_cut_coordinate,
7171  local_current_concurrent_cut_coordinate);
7172 
7173  for(mj_part_t i = 1; i < no_cuts ; ++i) {
7174  // if cuts share the same cut coordinates
7175  // set the cutmap accordingly.
7176  if(std::abs(host_current_concurrent_cut_coordinate(i) -
7177  host_current_concurrent_cut_coordinate(i-1)) < this->sEpsilon) {
7178  cut_map[i] = cut_map[i-1];
7179  }
7180  else {
7181  cut_map[i] = different_cut_count++;
7182  multiSVector tmp2MultiSVector;
7183  sort_vector_points_on_cut.push_back(tmp2MultiSVector);
7184  }
7185  }
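// Example (hypothetical cut coordinates): cuts at {1.0, 1.0, 2.5} give
// cut_map = {0, 0, 1} and different_cut_count = 2; the two coincident
// cuts at 1.0 share sort vector 0.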
7186  Kokkos::deep_copy(current_concurrent_cut_coordinate,
7187  host_current_concurrent_cut_coordinate);
7188 
7189  // now the actual part assignment.
7190  auto host_coordinate_permutations =
7191  Kokkos::create_mirror_view(coordinate_permutations);
7192  Kokkos::deep_copy(host_coordinate_permutations, coordinate_permutations);
7193 
7194  auto host_assigned_part_ids = Kokkos::create_mirror_view(assigned_part_ids);
7195  Kokkos::deep_copy(host_assigned_part_ids, assigned_part_ids);
7196 
7197  auto host_mj_coordinates = Kokkos::create_mirror_view(mj_coordinates);
7198  Kokkos::deep_copy(host_mj_coordinates, mj_coordinates);
7199 
7200  auto host_thread_point_counts = Kokkos::create_mirror_view(thread_point_counts);
7201  Kokkos::deep_copy(host_thread_point_counts, thread_point_counts);
7202 
7203  auto local_coord_dim = this->coord_dim;
7204 
7205  for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7206  mj_lno_t i = host_coordinate_permutations(ii);
7207  mj_part_t pp = host_assigned_part_ids(i);
7208  mj_part_t p = pp / 2;
7209  // if the coordinate is on a cut.
7210  if(pp % 2 == 1 ) {
7211  mj_scalar_t *vals = new mj_scalar_t[local_coord_dim -1];
7212  allocated_memory.push_back(vals);
7213 
7214  // we insert the coordinates to the sort item here.
7215  int val_ind = 0;
7216 
7217  if(longest_dim_part) {
7218  // std::cout << std::endl << std::endl;
7219  for(int dim = local_coord_dim - 2; dim >= 0; --dim) {
7220  // uSignedSortItem<int, mj_scalar_t, char>
7221  // *p_coord_dimension_range_sorted
7222  int next_largest_coord_dim = p_coord_dimension_range_sorted[dim].id;
7223  // std::cout << "next_largest_coord_dim: " <<
7224  // next_largest_coord_dim << " ";
7225  // Note refactor in progress
7226  vals[val_ind++] =
7227  host_mj_coordinates(i,next_largest_coord_dim);
7228  }
7229  }
7230  else {
7231  for(int dim = coordInd + 1; dim < local_coord_dim; ++dim) {
7232  vals[val_ind++] = host_mj_coordinates(i,dim);
7233  }
7234  for(int dim = 0; dim < coordInd; ++dim) {
7235  vals[val_ind++] = host_mj_coordinates(i,dim);
7236  }
7237  }
7238 
7239  multiSItem tempSortItem(i, local_coord_dim -1, vals);
7240  //insert the point to the sort vector pointed by the cut_map[p].
7241  mj_part_t cmap = cut_map[p];
7242  sort_vector_points_on_cut[cmap].push_back(tempSortItem);
7243  }
7244  else {
7245  //if it is not on the cut, simple sorting.
7246  ++host_thread_point_counts(p);
7247  host_assigned_part_ids(i) = p;
7248  }
7249  }
7250 
7251  // sort all the sort vectors.
7252  for(mj_part_t i = 0; i < different_cut_count; ++i) {
7253  std::sort (sort_vector_points_on_cut[i].begin(),
7254  sort_vector_points_on_cut[i].end());
7255  }
7256 
7257  mj_part_t previous_cut_map = cut_map[0];
7258 
7259  auto host_thread_cut_line_weight_to_put_left =
7260  Kokkos::create_mirror_view(thread_cut_line_weight_to_put_left);
7261  Kokkos::deep_copy(host_thread_cut_line_weight_to_put_left,
7262  thread_cut_line_weight_to_put_left);
7263 
7264  auto host_mj_weights = Kokkos::create_mirror_view(mj_weights);
7265  Kokkos::deep_copy(host_mj_weights, mj_weights);
7266 
7267  // this is how much of the current part's weight the previous part
7268  // already owns. when the target part weight is 1.6, and the part on
7269  // the left is given 2, the left has an extra 0.4, while the right is
7270  // missing 0.4 from the previous cut.
7271  // This parameter is used to balance that discrepancy.
7272  // in the above example weight_stolen_from_previous_part will be 0.4.
7273  // if the left part's target is 2.2 but it is given 2,
7274  // then weight_stolen_from_previous_part will be -0.2.
7275  mj_scalar_t weight_stolen_from_previous_part = 0;
7276  for(mj_part_t p = 0; p < no_cuts; ++p) {
7277  mj_part_t mapped_cut = cut_map[p];
7278 
7279  // if the previous cut is done and does not share this cut's position,
7280  // then assign all points left on that cut to the part on its right.
7281  if(previous_cut_map != mapped_cut) {
7282  mj_lno_t sort_vector_end = (mj_lno_t)
7283  sort_vector_points_on_cut[previous_cut_map].size() - 1;
7284  for(; sort_vector_end >= 0; --sort_vector_end) {
7285  multiSItem t =
7286  sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7287  mj_lno_t i = t.index;
7288  ++host_thread_point_counts(p);
7289  host_assigned_part_ids(i) = p;
7290  }
7291  sort_vector_points_on_cut[previous_cut_map].clear();
7292  }
7293 
7294  // TODO: MD: I don't remember why I have it in reverse order here.
7295  mj_lno_t sort_vector_end = (mj_lno_t)
7296  sort_vector_points_on_cut[mapped_cut].size() - 1;
7297  // mj_lno_t sort_vector_begin= 0;
7298  // mj_lno_t sort_vector_size =
7299  // (mj_lno_t)sort_vector_points_on_cut[mapped_cut].size();
7300 
7301  // TODO commented for reverse order
7302  for(; sort_vector_end >= 0; --sort_vector_end) {
7303  // for(; sort_vector_begin < sort_vector_size; ++sort_vector_begin) {
7304  // TODO COMMENTED FOR REVERSE ORDER
7305  multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
7306  //multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_begin];
7307  mj_lno_t i = t.index;
7308  mj_scalar_t w = this->mj_uniform_weights(0) ? 1 :
7309  this->mj_weights(i,0);
7310  // if part p has enough space for point i, then put point i into part p.
7311  if(host_thread_cut_line_weight_to_put_left(p) +
7312  weight_stolen_from_previous_part> this->sEpsilon &&
7313  host_thread_cut_line_weight_to_put_left(p) +
7314  weight_stolen_from_previous_part -
7315  std::abs(host_thread_cut_line_weight_to_put_left(p) +
7316  weight_stolen_from_previous_part - w)> this->sEpsilon)
7317  {
7318  host_thread_cut_line_weight_to_put_left(p) -= w;
7319 
7320  sort_vector_points_on_cut[mapped_cut].pop_back();
7321 
7322  ++host_thread_point_counts(p);
7323  host_assigned_part_ids(i) = p;
7324  // if putting this weight to the left overweights the left cut, then
7325  // increase the space for the next cut using
7326  // weight_stolen_from_previous_part.
7327  if(p < no_cuts - 1 &&
7328  host_thread_cut_line_weight_to_put_left(p) < this->sEpsilon) {
7329  if(mapped_cut == cut_map[p + 1] ) {
7330  // the next cut shares the same position; hand the leftover
7331  // space to it through weight_stolen_from_previous_part.
7332  if(previous_cut_map != mapped_cut) {
7333  weight_stolen_from_previous_part =
7334  host_thread_cut_line_weight_to_put_left(p);
7335  }
7336  else {
7337  // if the cut before the cut indexed at p was also at the same
7338  // position, we accumulate the extra weights cumulatively.
7339  weight_stolen_from_previous_part +=
7340  host_thread_cut_line_weight_to_put_left(p);
7341  }
7342  }
7343  else{
7344  weight_stolen_from_previous_part =
7345  -host_thread_cut_line_weight_to_put_left(p);
7346  }
7347  // end assignment for part p
7348  break;
7349  }
7350  } else {
7351  // if part p does not have enough space for this point
7352  // and if there is another cut sharing the same position,
7353  // again increase the space for the next cut
7354  if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]) {
7355  if(previous_cut_map != mapped_cut) {
7356  weight_stolen_from_previous_part =
7357  host_thread_cut_line_weight_to_put_left(p);
7358  }
7359  else {
7360  weight_stolen_from_previous_part +=
7361  host_thread_cut_line_weight_to_put_left(p);
7362  }
7363  }
7364  else{
7365  weight_stolen_from_previous_part =
7366  -host_thread_cut_line_weight_to_put_left(p);
7367  }
7368  // end assignment for part p
7369  break;
7370  }
7371  }
7372  previous_cut_map = mapped_cut;
7373  }
7374 
7375  // TODO commented for reverse order
7376  // put everything left on the last cut to the last part.
7377  mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[
7378  previous_cut_map].size() - 1;
7379 
7380  // mj_lno_t sort_vector_begin= 0;
7381  // mj_lno_t sort_vector_size = (mj_lno_t)
7382  // sort_vector_points_on_cut[previous_cut_map].size();
7383  // TODO commented for reverse order
7384  for(; sort_vector_end >= 0; --sort_vector_end) {
7385  // TODO commented for reverse order
7386  multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7387  // multiSItem t =
7388  // sort_vector_points_on_cut[previous_cut_map][sort_vector_begin];
7389  mj_lno_t i = t.index;
7390  ++host_thread_point_counts(no_cuts);
7391  host_assigned_part_ids(i) = no_cuts;
7392  }
7393 
7394  sort_vector_points_on_cut[previous_cut_map].clear();
7395  delete [] cut_map;
7396 
7397  // free the memory allocated for the vertex sort items.
7398  mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
7399  for(mj_lno_t i = 0; i < vSize; ++i) {
7400  delete [] allocated_memory[i];
7401  }
7402 
7403  auto local_out_part_xadj = out_part_xadj;
7404  auto host_out_part_xadj = Kokkos::create_mirror_view(local_out_part_xadj);
7405  Kokkos::deep_copy(host_out_part_xadj, out_part_xadj);
7406 
7407  // creation of part_xadj as in usual case.
7408  for(mj_part_t j = 0; j < num_parts; ++j) {
7409  host_out_part_xadj(j) = host_thread_point_counts(j);
7410  host_thread_point_counts(j) = 0;
7411  }
7412 
7413  // perform prefix sum for num_points in parts.
7414  for(mj_part_t j = 1; j < num_parts; ++j) {
7415  host_out_part_xadj(j) += host_out_part_xadj(j - 1);
7416  }
7417 
7418  // shift the per-part point counts to obtain the
7419  // beginning index of each part's space.
7420  for(mj_part_t j = 1; j < num_parts; ++j) {
7421  host_thread_point_counts(j) += host_out_part_xadj(j - 1);
7422  }
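// Example (hypothetical counts): with host_thread_point_counts = {3, 1, 2},
// the two loops above produce host_out_part_xadj = {3, 4, 6} and starting
// offsets host_thread_point_counts = {0, 3, 4}, so part j writes its
// coordinates into the range [xadj(j-1), xadj(j)).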
7423 
7424  auto host_new_coordinate_permutations =
7425  Kokkos::create_mirror_view(new_coordinate_permutations);
7426  Kokkos::deep_copy(host_new_coordinate_permutations,
7427  new_coordinate_permutations);
7428 
7429  // now write each coordinate's index into the permutation array
7430  // at the position computed from its part.
7431  for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7432  mj_lno_t i = host_coordinate_permutations(ii);
7433  mj_part_t p = host_assigned_part_ids(i);
7434  host_new_coordinate_permutations(coordinate_begin +
7435  host_thread_point_counts(p)++) = i;
7436  }
7437 
7438  Kokkos::deep_copy(thread_point_counts, host_thread_point_counts);
7439  Kokkos::deep_copy(new_coordinate_permutations,
7440  host_new_coordinate_permutations);
7441  Kokkos::deep_copy(local_out_part_xadj, host_out_part_xadj);
7442 }
7443 
7453 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7454  typename mj_part_t, typename mj_node_t>
7455 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7456  set_final_parts(
7457  mj_part_t current_num_parts,
7458  mj_part_t output_part_begin_index,
7459  RCP<mj_partBoxVector_t> &output_part_boxes,
7460  bool is_data_ever_migrated)
7461 {
7462  this->mj_env->timerStart(MACRO_TIMERS,
7463  mj_timer_base_string + "Part_Assignment");
7464 
7465  auto local_part_xadj = part_xadj;
7466  auto local_mj_keep_part_boxes = mj_keep_part_boxes;
7467  auto local_coordinate_permutations = coordinate_permutations;
7468  auto local_assigned_part_ids = assigned_part_ids;
7469 
7470  if(local_mj_keep_part_boxes) {
7471  for(int i = 0; i < current_num_parts; ++i) {
7472  (*output_part_boxes)[i].setpId(i + output_part_begin_index);
7473  }
7474  }
7475 
7476  Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy(
7477  current_num_parts, Kokkos::AUTO());
7478  typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
7479  member_type member_type;
7480  Kokkos::parallel_for(policy, KOKKOS_LAMBDA(member_type team_member) {
7481  int i = team_member.league_rank();
7482  Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, (i != 0) ?
7483  local_part_xadj(i-1) : 0, local_part_xadj(i)),
7484  [=] (mj_lno_t ii) {
7485  mj_lno_t k = local_coordinate_permutations(ii);
7486  local_assigned_part_ids(k) = i + output_part_begin_index;
7487  });
7488  });
7489 
7490  if(is_data_ever_migrated) {
7491 #ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
7492  if(sizeof(mj_lno_t) <= sizeof(int)) {
7493 
7494  // Cannot use Zoltan_Comm with local ordinals larger than ints.
7495  // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
7496  // may overflow.
7497 
7498  // if data is migrated, then send part numbers to the original owners.
7499  ZOLTAN_COMM_OBJ *plan = NULL;
7500  MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->mj_problemComm));
7501 
7502  int incoming = 0;
7503  int message_tag = 7856;
7504 
7505  this->mj_env->timerStart(MACRO_TIMERS,
7506  mj_timer_base_string + "Final Z1PlanCreating");
7507 
7508  // setup incoming count
7509  int ierr = Zoltan_Comm_Create( &plan, int(this->num_local_coords),
7510  this->owner_of_coordinate.data(), mpi_comm, message_tag, &incoming);
7511 
7512  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7513  this->mj_env->timerStop(MACRO_TIMERS,
7514  mj_timer_base_string + "Final Z1PlanCreating" );
7515 
7516  this->mj_env->timerStart(MACRO_TIMERS,
7517  mj_timer_base_string + "Final Z1PlanComm");
7518 
7519  // MPI Buffers should be on Kokkos::HostSpace not Kokkos::CudaUVMSpace
7520  // Note, with UVM space, create_mirror_view does NOT create a non-UVM
7521  // view; need the explicit Host creation and deep_copy.
7522 
7523  // migrate gnos to actual owners.
7524  auto host_current_mj_gnos = Kokkos::create_mirror_view(
7525  Kokkos::HostSpace(), this->current_mj_gnos);
7526  deep_copy(host_current_mj_gnos, this->current_mj_gnos);
7527  Kokkos::View<mj_gno_t*, device_t> dst_gnos(
7528  Kokkos::ViewAllocateWithoutInitializing("dst_gnos"), incoming);
7529  auto host_dst_gnos = Kokkos::create_mirror_view(
7530  Kokkos::HostSpace(), dst_gnos);
7531  message_tag++;
7532  ierr = Zoltan_Comm_Do( plan, message_tag,
7533  (char *) host_current_mj_gnos.data(),
7534  sizeof(mj_gno_t), (char *) host_dst_gnos.data());
7535  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7536  Kokkos::deep_copy(dst_gnos, host_dst_gnos);
7537  this->current_mj_gnos = dst_gnos;
7538 
7539  // migrate part ids to actual owners.
7540  auto host_src_part_ids = Kokkos::create_mirror_view(
7541  Kokkos::HostSpace(), this->assigned_part_ids);
7542  deep_copy(host_src_part_ids, this->assigned_part_ids);
7543  Kokkos::View<mj_part_t*, device_t> dst_part_ids(
7544  Kokkos::ViewAllocateWithoutInitializing("dst_part_ids"), incoming);
7545  auto host_dst_part_ids = Kokkos::create_mirror_view(
7546  Kokkos::HostSpace(), dst_part_ids);
7547  message_tag++;
7548  ierr = Zoltan_Comm_Do( plan, message_tag,
7549  (char *) host_src_part_ids.data(),
7550  sizeof(mj_part_t), (char *) host_dst_part_ids.data());
7551  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7552  Kokkos::deep_copy(dst_part_ids, host_dst_part_ids);
7553  this->assigned_part_ids = dst_part_ids;
7554 
7555  ierr = Zoltan_Comm_Destroy(&plan);
7556  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7557 
7558  this->num_local_coords = incoming;
7559 
7560  this->mj_env->timerStop(MACRO_TIMERS,
7561  mj_timer_base_string + "Final Z1PlanComm");
7562  }
7563  else
7564 #endif // ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
7565  {
7566  // setup incoming count
7567  this->mj_env->timerStart(MACRO_TIMERS,
7568  mj_timer_base_string + "Final DistributorPlanCreating");
7569  Tpetra::Distributor distributor(this->mj_problemComm);
7570  ArrayView<const mj_part_t> owners_of_coords(
7571  this->owner_of_coordinate.data(), this->num_local_coords);
7572  mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
7573  this->mj_env->timerStop(MACRO_TIMERS,
7574  mj_timer_base_string + "Final DistributorPlanCreating" );
7575 
7576  this->mj_env->timerStart(MACRO_TIMERS,
7577  mj_timer_base_string + "Final DistributorPlanComm");
7578 
7579  // migrate gnos to actual owners.
7580  // MPI buffers should be Kokkos::HostSpace, not Kokkos::CudaUVMSpace
7581  // Note, with UVM space, create_mirror_view does NOT create a non-UVM
7582  // view; need the explicit Host creation and deep_copy.
7583  Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
7584  Kokkos::ViewAllocateWithoutInitializing("sent_gnos"),
7585  this->current_mj_gnos.extent(0));
7586  Kokkos::deep_copy(sent_gnos, this->current_mj_gnos);
7587 
7588  Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
7589  Kokkos::ViewAllocateWithoutInitializing("received_gnos"),
7590  incoming);
7591 
7592  distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
7593 
7594  this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
7595  Kokkos::ViewAllocateWithoutInitializing("current_mj_gnos"), incoming);
7596 
7597  Kokkos::deep_copy(this->current_mj_gnos, received_gnos);
7598 
7599  // migrate part ids to actual owners.
7600  Kokkos::View<mj_part_t *, Kokkos::HostSpace> sent_partids(
7601  Kokkos::ViewAllocateWithoutInitializing("sent_partids"),
7602  this->assigned_part_ids.extent(0));
7603  Kokkos::deep_copy(sent_partids, this->assigned_part_ids);
7604 
7605  Kokkos::View<mj_part_t *, Kokkos::HostSpace> received_partids(
7606  Kokkos::ViewAllocateWithoutInitializing("received_partids"),
7607  incoming);
7608 
7609  distributor.doPostsAndWaits(sent_partids, 1, received_partids);
7610 
7611  this->assigned_part_ids =
7612  Kokkos::View<mj_part_t*, device_t>(
7613  Kokkos::ViewAllocateWithoutInitializing("assigned_part_ids"),
7614  incoming);
7615 
7616  Kokkos::deep_copy(this->assigned_part_ids, received_partids);
7617  this->num_local_coords = incoming;
7618 
7619  this->mj_env->timerStop(MACRO_TIMERS,
7620  mj_timer_base_string + "Final DistributorPlanComm");
7621  }
7622  }
7623 
7624  this->mj_env->timerStop(MACRO_TIMERS,
7625  mj_timer_base_string + "Part_Assignment");
7626 
7627  this->mj_env->timerStart(MACRO_TIMERS,
7628  mj_timer_base_string + "Solution_Part_Assignment");
7629 
7630  // ArrayRCP<mj_part_t> partId;
7631  // partId = arcp(this->assigned_part_ids, 0, this->num_local_coords, true);
7632 
7633  if(this->mj_keep_part_boxes) {
7634  this->kept_boxes = compute_global_box_boundaries(output_part_boxes);
7635  }
7636 
7637  this->mj_env->timerStop(MACRO_TIMERS,
7638  mj_timer_base_string + "Solution_Part_Assignment");
7639 }
7640 
7653 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7654  typename mj_part_t, typename mj_node_t>
7655 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7656  set_partitioning_parameters(
7657  bool distribute_points_on_cut_lines_,
7658  int max_concurrent_part_calculation_,
7659  int check_migrate_avoid_migration_option_,
7660  double minimum_migration_imbalance_,
7661  int migration_type_)
7662 {
7663  this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
7664  this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
7665  this->check_migrate_avoid_migration_option =
7666  check_migrate_avoid_migration_option_;
7667  this->minimum_migration_imbalance = minimum_migration_imbalance_;
7668  this->migration_type = migration_type_;
7669 }
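// Hypothetical usage sketch (values for illustration only):
//   mj_partitioner.set_partitioning_parameters(
//     true, // distribute coincident points across both sides of a cut
//     1,    // number of parts processed concurrently
//     2,    // check_migrate_avoid_migration_option
//     0.35, // minimum imbalance before migration is worthwhile
//     1);   // migration_type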
7670 
7698 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7699  typename mj_part_t, typename mj_node_t>
7700 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7701  multi_jagged_part(
7702  const RCP<const Environment> &env,
7703  RCP<const Comm<int> > &problemComm,
7704  double imbalance_tolerance_,
7705  int num_teams_,
7706  size_t num_global_parts_,
7707  Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array_,
7708  int recursion_depth_,
7709  int coord_dim_,
7710  mj_lno_t num_local_coords_,
7711  mj_gno_t num_global_coords_,
7712  Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
7713  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
7714  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
7715  int num_weights_per_coord_,
7716  Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights_,
7717  Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
7718  Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts_,
7719  Kokkos::View<mj_part_t *, device_t> & result_assigned_part_ids_,
7720  Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos_)
7721 {
7722 
7723  // see comment above for Zoltan2_AlgMJ_TrackCallsCounter
7724  int execute_counter = Zoltan2_AlgMJ_TrackCallsCounter::get_counter_AlgMJ();
7725  this->mj_timer_base_string = "MJ(" + std::to_string(execute_counter) + ") - ";
7726 
7727  this->mj_env = env;
7728  this->mj_problemComm = problemComm;
7729  this->myActualRank = this->myRank = this->mj_problemComm->getRank();
7730  this->mj_env->timerStart(MACRO_TIMERS,
7731  mj_timer_base_string + "Total");
7732  this->mj_env->debug(3, "In MultiJagged Jagged");
7733  this->imbalance_tolerance = imbalance_tolerance_;
7734  this->mj_num_teams = num_teams_;
7735  this->num_global_parts = num_global_parts_;
7736  this->part_no_array = part_no_array_;
7737  this->recursion_depth = recursion_depth_;
7738  this->coord_dim = coord_dim_;
7739  this->num_local_coords = num_local_coords_;
7740  this->num_global_coords = num_global_coords_;
7741  this->mj_coordinates = mj_coordinates_;
7742  this->initial_mj_gnos = initial_mj_gnos_;
7743  this->num_weights_per_coord = num_weights_per_coord_;
7744  this->mj_uniform_weights = mj_uniform_weights_;
7745  this->mj_weights = mj_weights_;
7746  this->mj_uniform_parts = mj_uniform_parts_;
7747 
7748  // this->set_input_data();
7749 
7750  this->set_part_specifications();
7751 
7752  this->mj_env->timerStart(MACRO_TIMERS,
7753  mj_timer_base_string + "Allocate Views");
7754  this->allocate_set_work_memory();
7755  this->mj_env->timerStop(MACRO_TIMERS,
7756  mj_timer_base_string + "Allocate Views");
7757 
7758  // We duplicate the comm as we create subcommunicators during migration.
7759  // We keep the problemComm as it is, while comm changes after each migration.
7760  this->comm = this->mj_problemComm->duplicate();
7761 
7762 #ifdef print_debug
7763  if(comm->getRank() == 0) {
7764  std::cout << "size of gno:" << sizeof(mj_gno_t) << std::endl;
7765  std::cout << "size of lno:" << sizeof(mj_lno_t) << std::endl;
7766  std::cout << "size of mj_scalar_t:" << sizeof(mj_scalar_t) << std::endl;
7767  }
7768 #endif
7769 
7770  // initially there is a single partition
7771  mj_part_t current_num_parts = 1;
7772  Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
7773  this->all_cut_coordinates;
7774  this->mj_env->timerStart(MACRO_TIMERS,
7775  mj_timer_base_string + "Problem_Partitioning");
7776  mj_part_t output_part_begin_index = 0;
7777  mj_part_t future_num_parts = this->total_num_part;
7778  bool is_data_ever_migrated = false;
7779 
7780  std::vector<mj_part_t> *future_num_part_in_parts =
7781  new std::vector<mj_part_t> ();
7782  std::vector<mj_part_t> *next_future_num_parts_in_parts =
7783  new std::vector<mj_part_t> ();
7784 
7785  next_future_num_parts_in_parts->push_back(this->num_global_parts);
7786 
7787  RCP<mj_partBoxVector_t> input_part_boxes;
7788  RCP<mj_partBoxVector_t> output_part_boxes;
7789 
7790  if(this->mj_keep_part_boxes) {
7791  input_part_boxes = RCP<mj_partBoxVector_t>(new mj_partBoxVector_t(), true);
7792  output_part_boxes = RCP<mj_partBoxVector_t>(new mj_partBoxVector_t(), true);
7793  compute_global_box();
7794  this->init_part_boxes(output_part_boxes);
7795  }
7796 
7797  auto local_part_xadj = this->part_xadj;
7798 
7799  // Need a device counter - how best to allocate?
7800  // Putting this allocation in the loops is very costly so moved out here.
7801  Kokkos::View<mj_part_t*, device_t>
7802  view_rectilinear_cut_count("view_rectilinear_cut_count", 1);
7803  Kokkos::View<size_t*, device_t>
7804  view_total_reduction_size("view_total_reduction_size", 1);
7805 
7806  for(int i = 0; i < this->recursion_depth; ++i) {
7807 
7808  // convert i to string to be used for debugging purposes.
7809  std::string istring = std::to_string(i);
7810 
7811  // next_future_num_parts_in_parts will be sized to the number of output
7812  // parts, and will hold how many more parts each output part
7813  // should be divided into. this array is also used to determine the weight
7814  // ratios of the parts. swap the arrays to use them iteratively.
7815  std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
7816  future_num_part_in_parts = next_future_num_parts_in_parts;
7817  next_future_num_parts_in_parts = tmpPartVect;
7818 
7819  // clear next_future_num_parts_in_parts array as
7820  // getPartitionArrays expects it to be empty.
7821  next_future_num_parts_in_parts->clear();
7822  if(this->mj_keep_part_boxes) {
7823  RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7824  input_part_boxes = output_part_boxes;
7825  output_part_boxes = tmpPartBoxes;
7826  output_part_boxes->clear();
7827  }
7828 
7829  // returns the total no. of output parts for this dimension partitioning.
7830  mj_part_t output_part_count_in_dimension =
7831  this->update_part_num_arrays(
7832  future_num_part_in_parts,
7833  next_future_num_parts_in_parts,
7834  future_num_parts,
7835  current_num_parts,
7836  i,
7837  input_part_boxes,
7838  output_part_boxes, 1);
7839 
7840  // if the number of obtained parts is equal to the current number of parts,
7841  // skip this dimension. For example, this happens when a 1 is given in the
7842  // input part array, e.g., P=4,5,1,2.
7843  if(output_part_count_in_dimension == current_num_parts) {
7844  //still need to swap the input output arrays.
7845  tmpPartVect= future_num_part_in_parts;
7846  future_num_part_in_parts = next_future_num_parts_in_parts;
7847  next_future_num_parts_in_parts = tmpPartVect;
7848 
7849  if(this->mj_keep_part_boxes) {
7850  RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7851  input_part_boxes = output_part_boxes;
7852  output_part_boxes = tmpPartBoxes;
7853  }
7854  continue;
7855  }
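// Worked example: with part_no_array = {4, 5, 1, 2} the part counts per
// dimension are 4, 20, 20, 40; at i = 2 the multiplier 1 leaves
// output_part_count_in_dimension equal to current_num_parts (20), so the
// swaps above are performed and this dimension is skipped.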
7856 
7857  // get the coordinate axis along which the partitioning will be done.
7858  int coordInd = i % this->coord_dim;
7859 
7860  Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
7861  Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
7862 
7863  this->mj_env->timerStart(MACRO_TIMERS,
7864  mj_timer_base_string + "Problem_Partitioning_" + istring);
7865 
7866  // allocate memory to hold the indices
7867  // of the parts in the permutation array.
7868  this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
7869  "new part xadj", output_part_count_in_dimension);
7870 
7871  // the index in new_part_xadj at which the next entry will be written.
7872  mj_part_t output_part_index = 0;
7873 
7874  // whatever is written at output_part_index will be incremented by
7875  // output_coordinate_end_index so that the point indices are shifted.
7876  mj_part_t output_coordinate_end_index = 0;
7877 
7878  mj_part_t current_work_part = 0;
7879  mj_part_t current_concurrent_num_parts =
7880  std::min(current_num_parts - current_work_part,
7881  this->max_concurrent_part_calculation);
7882 
7883  mj_part_t obtained_part_index = 0;
7884 
7885  auto host_process_local_min_max_coord_total_weight =
7886  Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
7887  auto host_global_min_max_coord_total_weight =
7888  Kokkos::create_mirror_view(global_min_max_coord_total_weight);
7889 
7890  // run for all available parts.
7891  for(; current_work_part < current_num_parts;
7892  current_work_part += current_concurrent_num_parts) {
7893 
7894  current_concurrent_num_parts =
7895  std::min(current_num_parts - current_work_part,
7896  this->max_concurrent_part_calculation);
7897 
7898  int bDoingWork_int; // Can't reduce on bool so use int
7899  auto local_device_num_partitioning_in_current_dim =
7900  device_num_partitioning_in_current_dim;
7901  Kokkos::parallel_reduce("Read bDoingWork",
7902  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
7903  KOKKOS_LAMBDA(int dummy, int & set_single) {
7904  set_single = 0;
7905  for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
7906  if(local_device_num_partitioning_in_current_dim(
7907  current_work_part + kk) != 1) {
7908  set_single = 1;
7909  break;
7910  }
7911  }
7912  }, bDoingWork_int);
7913  bool bDoingWork = (bDoingWork_int != 0);
7914 
7915  this->mj_get_local_min_max_coord_totW(
7916  current_work_part,
7917  current_concurrent_num_parts,
7918  mj_current_dim_coords);
7919 
7920  // 1D partitioning
7921  if(bDoingWork) {
7922  // obtain global Min max of the part.
7923  this->mj_get_global_min_max_coord_totW(
7924  current_concurrent_num_parts,
7925  this->process_local_min_max_coord_total_weight,
7926  this->global_min_max_coord_total_weight);
7927 
7928  // represents the total number of cutlines
7929  // whose coordinate should be determined.
7930  mj_part_t total_incomplete_cut_count = 0;
7931 
7932  // Compute weight ratios for parts & cuts:
7933  // e.g., 0.25 0.25 0.5 0.5 0.75 0.75 1
7934  // part0 cut0 part1 cut1 part2 cut2 part3
7935  mj_part_t concurrent_part_cut_shift = 0;
7936  mj_part_t concurrent_part_part_shift = 0;
7937 
7938  for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
7939 
7940  Kokkos::deep_copy(host_global_min_max_coord_total_weight,
7941  global_min_max_coord_total_weight);
7942 
7943  mj_scalar_t min_coordinate =
7944  host_global_min_max_coord_total_weight(kk);
7945  mj_scalar_t max_coordinate =
7946  host_global_min_max_coord_total_weight(
7947  kk + current_concurrent_num_parts);
7948 
7949  mj_scalar_t global_total_weight =
7950  host_global_min_max_coord_total_weight(
7951  kk + 2 * current_concurrent_num_parts);
7952 
7953  mj_part_t concurrent_current_part_index = current_work_part + kk;
7954 
7955  mj_part_t partition_count = host_num_partitioning_in_current_dim(
7956  concurrent_current_part_index);
7957 
7958  Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
7959  Kokkos::subview(current_cut_coordinates,
7960  std::pair<mj_lno_t, mj_lno_t>(
7961  concurrent_part_cut_shift, current_cut_coordinates.size()));
7962  Kokkos::View<mj_scalar_t *, device_t>
7963  current_target_part_weights =
7964  Kokkos::subview(target_part_weights,
7965  std::pair<mj_lno_t, mj_lno_t>(
7966  concurrent_part_part_shift, target_part_weights.size()));
7967 
7968  // advance the cut coordinate offset by the number of cuts.
7969  concurrent_part_cut_shift += partition_count - 1;
7970  // advance the target part weight offset by the number of parts.
7971  concurrent_part_part_shift += partition_count;
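// e.g., if the part at kk is split into partition_count = 4 parts, the
// next concurrent part's subviews start after 3 more cut coordinates
// and 4 more target part weights.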
7972 
7973  // calculate only if part is not empty,
7974  // and part will be further partitioned.
7975  if(partition_count > 1 && min_coordinate <= max_coordinate) {
7976 
7977  // increase the number of cuts to be determined by the
7978  // current part's cut count.
7979  total_incomplete_cut_count += partition_count - 1;
7980 
7981  this->incomplete_cut_count(kk) = partition_count - 1;
7982 
7983  // get the target weights of the parts
7984  this->mj_get_initial_cut_coords_target_weights(
7985  min_coordinate,
7986  max_coordinate,
7987  partition_count - 1,
7988  global_total_weight,
7989  usedCutCoordinate,
7990  current_target_part_weights,
7991  future_num_part_in_parts,
7992  next_future_num_parts_in_parts,
7993  concurrent_current_part_index,
7994  obtained_part_index);
7995 
7996  mj_lno_t coordinate_end_index =
7997  host_part_xadj(concurrent_current_part_index);
7998  mj_lno_t coordinate_begin_index =
7999  concurrent_current_part_index==0 ? 0 :
8000  host_part_xadj(concurrent_current_part_index - 1);
8001 
8002  this->set_initial_coordinate_parts(
8003  max_coordinate,
8004  min_coordinate,
8005  coordinate_begin_index, coordinate_end_index,
8006  this->coordinate_permutations,
8007  mj_current_dim_coords,
8008  this->assigned_part_ids,
8009  partition_count);
8010  }
8011  else {
8012  // e.g., if we have fewer coordinates than parts, we don't need to
8013  // do the next dim.
8014  this->incomplete_cut_count(kk) = 0;
8015  }
8016 
8017  obtained_part_index += partition_count;
8018  }
8019 
8020  // the used imbalance is always 0, as it is difficult to
8021  // estimate a range.
8022  double used_imbalance = 0;
8023  // Determine cut lines for all concurrent parts here.
8024  this->mj_env->timerStart(MACRO_TIMERS,
8025  mj_timer_base_string + "Problem_Partitioning Get Part Weights");
8026 
8027  this->mj_1D_part(
8028  mj_current_dim_coords,
8029  used_imbalance,
8030  current_work_part,
8031  current_concurrent_num_parts,
8032  current_cut_coordinates,
8033  total_incomplete_cut_count,
8034  view_rectilinear_cut_count,
8035  view_total_reduction_size);
8036 
8037  this->mj_env->timerStop(MACRO_TIMERS,
8038  mj_timer_base_string + "Problem_Partitioning Get Part Weights");
8039  }
8040 
8041  // create new part chunks
8042  {
8043  mj_part_t output_array_shift = 0;
8044  mj_part_t cut_shift = 0;
8045  size_t tlr_shift = 0;
8046  size_t partweight_array_shift = 0;
8047  for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
8048 
8049  mj_part_t current_concurrent_work_part = current_work_part + kk;
8050 
8051  mj_part_t num_parts = host_num_partitioning_in_current_dim(
8052  current_concurrent_work_part);
8053 
8054  // if the part is empty, skip the part.
8055  int coordinateA_bigger_than_coordinateB =
8056  host_global_min_max_coord_total_weight(kk) >
8057  host_global_min_max_coord_total_weight(
8058  kk + current_concurrent_num_parts);
8059 
8060  if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
8061  // we still need to write the begin and end point of the empty part.
8062  // simply set it to zero; the array indices will be shifted later.
8063  auto local_new_part_xadj = this->new_part_xadj;
8064  Kokkos::parallel_for(
8065  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8066  (0, num_parts), KOKKOS_LAMBDA (mj_part_t jj) {
8067  local_new_part_xadj(
8068  output_part_index + output_array_shift + jj) = 0;
8069  });
8070 
8071  cut_shift += num_parts - 1;
8072  tlr_shift += (4 *(num_parts - 1) + 1);
8073  output_array_shift += num_parts;
8074  partweight_array_shift += (2 * (num_parts - 1) + 1);
8075  continue;
8076  }
8077 
8078  Kokkos::View<mj_scalar_t *, device_t>
8079  current_concurrent_cut_coordinate =
8080  Kokkos::subview(current_cut_coordinates,
8081  std::pair<mj_lno_t, mj_lno_t>(
8082  cut_shift,
8083  current_cut_coordinates.size()));
8084  Kokkos::View<mj_scalar_t *, device_t>
8085  used_local_cut_line_weight_to_left =
8086  Kokkos::subview(process_cut_line_weight_to_put_left,
8087  std::pair<mj_lno_t, mj_lno_t>(
8088  cut_shift,
8089  process_cut_line_weight_to_put_left.size()));
8090 
8091  this->thread_part_weight_work =
8092  Kokkos::subview(
8093  this->thread_part_weights,
8094  std::pair<mj_lno_t, mj_lno_t>(
8095  partweight_array_shift,
8096  this->thread_part_weights.extent(0)));
8097 
8098  if(num_parts > 1) {
8099  if(this->mj_keep_part_boxes) {
8100  // if part boxes are to be stored update the boundaries.
8101  for(mj_part_t j = 0; j < num_parts - 1; ++j) {
8102  mj_scalar_t temp_get_val;
8103  Kokkos::parallel_reduce("Read single",
8104  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8105  KOKKOS_LAMBDA(int dummy, mj_scalar_t & set_single) {
8106  set_single = current_concurrent_cut_coordinate(j);
8107  }, temp_get_val);
8108  (*output_part_boxes)
8109  [output_array_shift + output_part_index + j].
8110  updateMinMax(temp_get_val, 1 /*update max*/, coordInd);
8111  (*output_part_boxes)
8112  [output_array_shift + output_part_index + j + 1].
8113  updateMinMax(temp_get_val, 0 /*update min*/, coordInd);
8114  }
8115  }
8116 
8117  // Rewrite the indices based on the computed cuts.
8118  Kokkos::View<mj_lno_t*, device_t> sub_new_part_xadj =
8119  Kokkos::subview(this->new_part_xadj,
8120  std::pair<mj_lno_t, mj_lno_t>(
8121  output_part_index + output_array_shift,
8122  this->new_part_xadj.size()));
8123 
8124  this->mj_create_new_partitions(
8125  num_parts,
8126  current_concurrent_work_part,
8127  mj_current_dim_coords,
8128  current_concurrent_cut_coordinate,
8129  used_local_cut_line_weight_to_left,
8130  sub_new_part_xadj);
8131  }
8132  else {
8133 
8134  mj_lno_t coordinate_end = host_part_xadj(
8135  current_concurrent_work_part);
8136  mj_lno_t coordinate_begin =
8137  current_concurrent_work_part==0 ? 0 : host_part_xadj(
8138  current_concurrent_work_part - 1);
8139 
8140  // if this part is partitioned into 1 then just copy
8141  // the old values.
8142  mj_lno_t part_size = coordinate_end - coordinate_begin;
8143 
8144  // Awkward here to set one value - need some broader
8145  // refactoring to improve this one.
8146  auto local_new_part_xadj = this->new_part_xadj;
8147  Kokkos::parallel_for(
8148  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
8149  (0, 1), KOKKOS_LAMBDA (int dummy) {
8150  local_new_part_xadj(
8151  output_part_index + output_array_shift) = part_size;
8152  });
8153 
8154  auto subview_new_coordinate_permutations =
8155  Kokkos::subview(this->new_coordinate_permutations,
8156  std::pair<mj_lno_t, mj_lno_t>(
8157  coordinate_begin,
8158  coordinate_begin + part_size));
8159  auto subview_coordinate_permutations =
8160  Kokkos::subview(this->coordinate_permutations,
8161  std::pair<mj_lno_t, mj_lno_t>(
8162  coordinate_begin,
8163  coordinate_begin + part_size));
8164  Kokkos::deep_copy(subview_new_coordinate_permutations,
8165  subview_coordinate_permutations);
8166  }
8167  cut_shift += num_parts - 1;
8168  output_array_shift += num_parts;
8169  partweight_array_shift += (2 * (num_parts - 1) + 1);
8170  }
8171 
8172  // shift cut coordinates so that all cut coordinates are stored.
8173  // no shift now because we don't keep the cuts.
8174  // current_cut_coordinates += cut_shift;
8175  // mj_create_new_partitions partitioned the coordinates and wrote
8176  // the indices as if each part began at index zero.
8177  // now we need to shift the beginning indices.
8178  for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
8179  mj_part_t num_parts =
8180  host_num_partitioning_in_current_dim(current_work_part + kk);
8181 
8182  // These two kernels are a bit awkward but need broader redesign to
8183  // avoid this situation.
8184  auto local_new_part_xadj = this->new_part_xadj;
8185  Kokkos::parallel_for(
8186  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8187  (0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
8188  local_new_part_xadj(output_part_index+ii) +=
8189  output_coordinate_end_index;
8190  });
8191 
8192  // increase the previous count by current end.
8193  mj_part_t temp_get;
8194  Kokkos::parallel_reduce("Read single",
8195  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8196  KOKKOS_LAMBDA(int dummy, mj_part_t & set_single) {
8197  set_single =
8198  local_new_part_xadj(output_part_index + num_parts - 1);
8199  }, temp_get);
8200  output_coordinate_end_index = temp_get;
8201  // advance the current output part index.
8202  output_part_index += num_parts;
8203  }
8204  }
8205  }
8206 
8207  // end of this partitioning dimension
8208  int current_world_size = this->comm->getSize();
8209  long migration_reduce_all_population =
8210  this->total_dim_num_reduce_all * current_world_size;
8211  bool is_migrated_in_current_dimension = false;
8212 
8213  // we migrate if there are more partitionings to be done after this
8214  // step, if migration is not forced to be avoided,
8215  // and if the operation is not sequential.
8216  if(future_num_parts > 1 &&
8217  this->check_migrate_avoid_migration_option >= 0 &&
8218  current_world_size > 1) {
8219  this->mj_env->timerStart(MACRO_TIMERS,
8220  mj_timer_base_string + "Problem_Migration-" + istring);
8221  mj_part_t num_parts = output_part_count_in_dimension;
8222 
8223  if(this->mj_perform_migration(
8224  num_parts,
8225  current_num_parts, //output
8226  next_future_num_parts_in_parts, //output
8227  output_part_begin_index,
8228  migration_reduce_all_population,
8229  this->num_global_coords / (future_num_parts * current_num_parts),
8230  istring,
8231  input_part_boxes, output_part_boxes) )
8232  {
8233  is_migrated_in_current_dimension = true;
8234  is_data_ever_migrated = true;
8235  this->mj_env->timerStop(MACRO_TIMERS,
8236  mj_timer_base_string + "Problem_Migration-" + istring);
8237  // since data is migrated, we reduce the number of reduceAll
8238  // operations for the last part.
8239  this->total_dim_num_reduce_all /= num_parts;
8240  }
8241  else {
8242  is_migrated_in_current_dimension = false;
8243  this->mj_env->timerStop(MACRO_TIMERS,
8244  mj_timer_base_string + "Problem_Migration-" + istring);
8245  }
8246  }
8247 
8248  // swap the coordinate permutations for the next dimension.
8249  Kokkos::View<mj_lno_t*, device_t> tmp =
8250  this->coordinate_permutations;
8251  this->coordinate_permutations =
8252  this->new_coordinate_permutations;
8253 
8254  this->new_coordinate_permutations = tmp;
8255  if(!is_migrated_in_current_dimension) {
8256  this->total_dim_num_reduce_all -= current_num_parts;
8257  current_num_parts = output_part_count_in_dimension;
8258  }
8259 
8260  {
8261  this->part_xadj = this->new_part_xadj;
8262  local_part_xadj = this->new_part_xadj;
8263  this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
8264  Kokkos::deep_copy(host_part_xadj, part_xadj); // keep in sync
8265 
8266  this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>("empty", 0);
8267  this->mj_env->timerStop(MACRO_TIMERS,
8268  mj_timer_base_string + "Problem_Partitioning_" + istring);
8269  }
8270  }
8271 
8272  // Partitioning is done
8273  delete future_num_part_in_parts;
8274  delete next_future_num_parts_in_parts;
8275  this->mj_env->timerStop(MACRO_TIMERS,
8276  mj_timer_base_string + "Problem_Partitioning");
8278 
8279  //get the final parts of each initial coordinate
8280  //the results will be written to
8281  //this->assigned_part_ids for gnos given in this->current_mj_gnos
8282  this->set_final_parts(
8283  current_num_parts,
8284  output_part_begin_index,
8285  output_part_boxes,
8286  is_data_ever_migrated);
8287 
8288  result_assigned_part_ids_ = this->assigned_part_ids;
8289  result_mj_gnos_ = this->current_mj_gnos;
8290  this->mj_env->timerStop(MACRO_TIMERS,
8291  mj_timer_base_string + "Total");
8292  this->mj_env->debug(3, "Out of MultiJagged");
8293 }
8294 
8295 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
8296  typename mj_part_t, typename mj_node_t>
8297 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8298  mj_partBoxVector_t>
8299 AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8300  get_kept_boxes() const
8301 {
8302  if(this->mj_keep_part_boxes) {
8303  return this->kept_boxes;
8304  }
8305  else {
8306  throw std::logic_error("Error: part boxes are not stored.");
8307  }
8308 }
8309 
8310 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
8311  typename mj_part_t, typename mj_node_t>
8312 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8313  mj_partBoxVector_t>
8314 AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8315  compute_global_box_boundaries(RCP<mj_partBoxVector_t> &localPartBoxes) const
8316 {
8317  typedef typename Zoltan2::coordinateModelPartBox::coord_t coord_t;
8318  mj_part_t ntasks = this->num_global_parts;
8319  int dim = (*localPartBoxes)[0].getDim();
8320  coord_t *localPartBoundaries = new coord_t[ntasks * 2 *dim];
8321 
8322  memset(localPartBoundaries, 0, sizeof(coord_t) * ntasks * 2 *dim);
8323 
8324  coord_t *globalPartBoundaries = new coord_t[ntasks * 2 *dim];
8325  memset(globalPartBoundaries, 0, sizeof(coord_t) * ntasks * 2 *dim);
8326 
8327  coord_t *localPartMins = localPartBoundaries;
8328  coord_t *localPartMaxs = localPartBoundaries + ntasks * dim;
8329 
8330  coord_t *globalPartMins = globalPartBoundaries;
8331  coord_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
8332 
8333  mj_part_t boxCount = localPartBoxes->size();
8334  for(mj_part_t i = 0; i < boxCount; ++i) {
8335  mj_part_t pId = (*localPartBoxes)[i].getpId();
8336 
8337  // cout << "me:" << comm->getRank() << " has:" << pId << endl;
8338 
8339  coord_t *lmins = (*localPartBoxes)[i].getlmins();
8340  coord_t *lmaxs = (*localPartBoxes)[i].getlmaxs();
8341 
8342  for(int j = 0; j < dim; ++j) {
8343  localPartMins[dim * pId + j] = lmins[j];
8344  localPartMaxs[dim * pId + j] = lmaxs[j];
8345 
8346  /*
8347  std::cout << "me:" << comm->getRank() <<
8348  " dim * pId + j:"<< dim * pId + j <<
8349  " localMin:" << localPartMins[dim * pId + j] <<
8350  " localMax:" << localPartMaxs[dim * pId + j] << std::endl;
8351  */
8352  }
8353  }
8354 
8355  Teuchos::Zoltan2_BoxBoundaries<int, coord_t> reductionOp(ntasks * 2 *dim);
8356 
8357  reduceAll<int, coord_t>(*mj_problemComm, reductionOp,
8358  ntasks * 2 *dim, localPartBoundaries, globalPartBoundaries);
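 // Each rank contributes only the boundary entries it owns (all other
 // entries stay zero), and the reduction keeps any entry whose
 // magnitude exceeds epsilon, so after reduceAll every rank holds the
 // boundaries of all parts.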
8359 
8360  RCP<mj_partBoxVector_t> pB(new mj_partBoxVector_t(),true);
8361  for(mj_part_t i = 0; i < ntasks; ++i) {
8362  coordinateModelPartBox tpb(i, dim,
8363  globalPartMins + dim * i,
8364  globalPartMaxs + dim * i);
8365 
8366  /*
8367  for(int j = 0; j < dim; ++j) {
8368  std::cout << "me:" << comm->getRank() <<
8369  " dim * pId + j:"<< dim * i + j <<
8370  " globalMin:" << globalPartMins[dim * i + j] <<
8371  " globalMax:" << globalPartMaxs[dim * i + j] << std::endl;
8372  }
8373  */
8374 
8375  pB->push_back(tpb);
8376  }
8377  delete []localPartBoundaries;
8378  delete []globalPartBoundaries;
8379  //RCP <mj_partBoxVector_t> tmpRCPBox(pB, true);
8380  return pB;
8381 }
8382 
8385 template <typename Adapter>
8386 class Zoltan2_AlgMJ : public Algorithm<Adapter>
8387 {
8388 
8389 private:
8390 
8391 #ifndef DOXYGEN_SHOULD_SKIP_THIS
8392  // For coordinates and weights, MJ needs floats or doubles
8393  // But Adapter can provide other scalars, e.g., ints.
8394  // So have separate scalar_t for MJ and adapter.
8395  typedef typename Adapter::scalar_t adapter_scalar_t;
8396 
8397  // Provide a default type for mj_scalar_t;
8398  typedef float default_mj_scalar_t;
8399 
8400  // If Adapter provided float or double scalar_t, use it (prevents copies).
8401  // Otherwise, use the default type of mj_scalar_t;
8402  typedef typename
8403  std::conditional<
8404  (std::is_same<adapter_scalar_t, float>::value ||
8405  std::is_same<adapter_scalar_t, double>::value),
8406  adapter_scalar_t, default_mj_scalar_t>::type mj_scalar_t;
8407 
8408  typedef typename Adapter::gno_t mj_gno_t;
8409  typedef typename Adapter::lno_t mj_lno_t;
8410  typedef typename Adapter::part_t mj_part_t;
8411  typedef typename Adapter::node_t mj_node_t;
8412  typedef coordinateModelPartBox mj_partBox_t;
8413  typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
8414  typedef typename mj_node_t::device_type device_t;
8415 #endif
8416 
8417  AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t> mj_partitioner;
8418 
8419  RCP<const Environment> mj_env; // the environment object
8420  RCP<const Comm<int> > mj_problemComm; // initial comm object
8421  RCP<const typename Adapter::base_adapter_t> mj_adapter; // coordinate adapter
8422 
8423  // PARAMETERS
8424  double imbalance_tolerance; // input imbalance tolerance.
8425 
8426  int num_teams; // how many teams to run main loop with
8427 
8428  size_t num_global_parts; // the targeted number of parts
8429 
8430  // input part array specifying num part to divide along each dim.
8431  Kokkos::View<mj_part_t*, Kokkos::HostSpace> part_no_array;
8432 
8433  // the number of steps that partitioning will be solved in.
8434  int recursion_depth;
8435 
8436  int coord_dim; // coordinate dimension.
8437  mj_lno_t num_local_coords; //number of local coords.
8438  mj_gno_t num_global_coords; //number of global coords.
8439 
8440  // initial global ids of the coordinates.
8441  Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
8442 
8443  // two dimension coordinate array.
8444  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8445  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8446  mj_coordinates;
8447 
8448  int num_weights_per_coord; // number of weights per coordinate
8449 
8450  // if the coordinates have uniform weights.
8451  Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_weights;
8452 
8453  // two dimensional weight array.
8454  Kokkos::View<mj_scalar_t**, device_t> mj_weights;
8455 
8456  // if the target parts are uniform
8457  Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_parts;
8458 
8459  // Nonuniform first level partitioning
8460  // Currently used for Dragonfly task mapping by partitioning Dragonfly RCA
8461  // machine coordinates and application coordinates.
8462  // An optimization that completely partitions the most important machine
8463  // dimension first (i.e. the Dragonfly group coordinate, or RCA's x
8464  // coordinate). The standard MJ alg follows after the nonuniform first level
8465  // partitioning.
8466  // If used, number of parts for the first level partitioning
8467  mj_part_t num_first_level_parts;
8468 
8469  // If used, the distribution of parts for the nonuniform
8470  // first level partitioning
8471  Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;
8472 
8473  // if partitioning can distribute points on the same coordinate to
8474  // different parts.
8475  bool distribute_points_on_cut_lines;
8476 
8477  // how many parts we can calculate concurrently.
8478  mj_part_t max_concurrent_part_calculation;
8479 
8480  // whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
8481  int check_migrate_avoid_migration_option;
8482 
8483  // when doing the migration, 0 will aim for perfect load-balance,
8484  // 1 for minimized messages.
8485  int migration_type;
8486 
8487 
8488  // when MJ decides whether to migrate, the minimum imbalance for migration.
8489  double minimum_migration_imbalance;
8490  bool mj_keep_part_boxes; //if the boxes need to be kept.
8491 
8492  // if this is set, then recursion depth is adjusted to its maximum value.
8493  bool mj_run_as_rcb;
8494  int mj_premigration_option;
8495  int min_coord_per_rank_for_premigration;
8496 
8497  // communication graph xadj
8498  ArrayRCP<mj_part_t> comXAdj_;
8499 
8500  // communication graph adj.
8501  ArrayRCP<mj_part_t> comAdj_;
8502 
8503  void copy(
8504  const RCP<PartitioningSolution<Adapter> >&solution);
8505 
8506  void set_input_parameters(const Teuchos::ParameterList &p);
8507 
8508  RCP<mj_partBoxVector_t> getGlobalBoxBoundaries() const;
8509 
8510  bool mj_premigrate_to_subset(
8511  int used_num_ranks,
8512  int migration_selection_option,
8513  RCP<const Environment> mj_env_,
8514  RCP<const Comm<int> > mj_problemComm_,
8515  int coord_dim_,
8516  mj_lno_t num_local_coords_,
8517  mj_gno_t num_global_coords_, size_t num_global_parts_,
8518  Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8519  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8520  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8521  mj_coordinates_,
8522  int num_weights_per_coord_,
8523  Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8524  //results
8525  RCP<const Comm<int> > &result_problemComm_,
8526  mj_lno_t & result_num_local_coords_,
8527  Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8528  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8529  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8530  result_mj_coordinates_,
8531  Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8532  int * &result_actual_owner_rank_);
8533 
8534 public:
8535 
8536  Zoltan2_AlgMJ(const RCP<const Environment> &env,
8537  RCP<const Comm<int> > &problemComm,
8538  const RCP<const typename Adapter::base_adapter_t> &adapter) :
8539  mj_partitioner(),
8540  mj_env(env),
8541  mj_problemComm(problemComm),
8542  mj_adapter(adapter),
8543  imbalance_tolerance(0),
8544  num_teams(0),
8545  num_global_parts(1),
8546  recursion_depth(0),
8547  coord_dim(0),
8548  num_local_coords(0),
8549  num_global_coords(0),
8550  num_weights_per_coord(0),
8551  num_first_level_parts(1),
8552  distribute_points_on_cut_lines(true),
8553  max_concurrent_part_calculation(1),
8554  check_migrate_avoid_migration_option(0),
8555  migration_type(0),
8556  minimum_migration_imbalance(0.30),
8557  mj_keep_part_boxes(false),
8558  mj_run_as_rcb(false),
8559  mj_premigration_option(0),
8560  min_coord_per_rank_for_premigration(32000),
8561  comXAdj_(),
8562  comAdj_()
8563  {
8564  }
8565 
8566  ~Zoltan2_AlgMJ()
8567  {
8568  }
8569 
8572  static void getValidParameters(ParameterList & pl)
8573  {
8574  const bool bUnsorted = true; // this clarifies the flag is for unsorted
8575  RCP<Zoltan2::IntegerRangeListValidator<int>> mj_parts_Validator =
8576  Teuchos::rcp( new Zoltan2::IntegerRangeListValidator<int>(bUnsorted) );
8577  pl.set("mj_parts", "0", "list of parts for multiJagged partitioning "
8578  "algorithm. As many as the dimension count.", mj_parts_Validator);
8579 
8580  pl.set("mj_concurrent_part_count", 1, "The number of parts whose cut "
8581  "coordinates will be calculated concurrently.",
8582  Environment::getAnyIntValidator());
8583 
8584  pl.set("mj_minimum_migration_imbalance", 1.1,
8585  "mj_minimum_migration_imbalance, the minimum imbalance of the "
8586  "processors to avoid migration",
8587  Environment::getAnyDoubleValidator());
8588 
8589  RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_option_validator =
8590  Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 2) );
8591  pl.set("mj_migration_option", 1, "Migration option, 0 for decision "
8592  "depending on the imbalance, 1 for forcing migration, 2 for "
8593  "avoiding migration", mj_migration_option_validator);
8594 
8595  RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_type_validator =
8596  Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 1) );
8597  pl.set("mj_migration_type", 0,
8598  "Migration type, 0 for migration to minimize the imbalance "
8599  "1 for migration to minimize messages exchanged during the migration.",
8600  mj_migration_type_validator);
8601 
8602  // bool parameter
8603  pl.set("mj_keep_part_boxes", false, "Keep the part boundaries of the "
8604  "geometric partitioning.", Environment::getBoolValidator());
8605 
8606  // bool parameter
8607  pl.set("mj_enable_rcb", false, "Use MJ as RCB.",
8608  Environment::getBoolValidator());
8609 
8610  pl.set("mj_recursion_depth", -1, "Recursion depth for MJ: Must be "
8611  "greater than 0.", Environment::getAnyIntValidator());
8612 
8613  RCP<Teuchos::EnhancedNumberValidator<int>>
8614  mj_num_teams_validator =
8615  Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(
8616  0, Teuchos::EnhancedNumberTraits<int>::max()) );
8617  pl.set("mj_num_teams", 0,
8618  "How many teams for the main kernel loop"
8619  , mj_num_teams_validator);
8620 
8621  RCP<Teuchos::EnhancedNumberValidator<int>>
8622  mj_premigration_option_validator =
8623  Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 1024) );
8624 
8625  pl.set("mj_premigration_option", 0,
8626  "Whether to do premigration or not. 0 for no premigration, "
8627  "x > 0 for migration to consecutive processors; "
8628  "the subset will be ranks 0, x, 2x, 3x, ..."
8629  , mj_premigration_option_validator);
8630 
8631  pl.set("mj_premigration_coordinate_count", 32000, "How many coordinates "
8632  "to assign each rank in multijagged after premigration",
8633  Environment::getAnyIntValidator());
8634  }
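 /* A minimal usage sketch for the parameters registered above. The
    "algorithm" key and the surrounding problem setup come from the usual
    Zoltan2 workflow and are assumptions here, not defined in this file:

      Teuchos::ParameterList params("mj example");
      params.set("algorithm", "multijagged");
      params.set("mj_parts", "2,2,4");           // 2*2*4 = 16 parts, 3 dims
      params.set("mj_concurrent_part_count", 2); // compute 2 parts at once
      params.set("mj_keep_part_boxes", true);    // needed for pointAssign
 */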
8635 
8641  void partition(const RCP<PartitioningSolution<Adapter> > &solution);
8642 
8643  mj_partBoxVector_t &getPartBoxesView() const
8644  {
8645  RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
8646  return *pBoxes;
8647  }
8648 
8649  mj_part_t pointAssign(int dim, adapter_scalar_t *point) const;
8650 
8651  void boxAssign(int dim, adapter_scalar_t *lower, adapter_scalar_t *upper,
8652  size_t &nPartsFound, mj_part_t **partsFound) const;
8653 
8656  void getCommunicationGraph(
8657  const PartitioningSolution<Adapter> *solution,
8658  ArrayRCP<mj_part_t> &comXAdj,
8659  ArrayRCP<mj_part_t> &comAdj);
8660 
8661  void set_up_partitioning_data( // public for CUDA
8662  const RCP<PartitioningSolution<Adapter> >&solution);
8663 
8664  private:
8665  std::string timer_base_string; // used for making timers
8666 
8667  // After loading views from coordinate adapter we may need to copy them
8668  // if mj type is different, but otherwise we just want to assign the view.
8669  // So purpose of this code is to make that assign only happen when the types
8670  // match. The empty case would otherwise not compile.
8671  // If they don't match the internal code handles allocating the new view
8672  // and copying the elements. See the test Zoltan2_mj_int_coordinates.
8673  template<class dst_t, class src_t> // version for same types
8674  typename std::enable_if<std::is_same<typename dst_t::value_type,
8675  typename src_t::value_type>::value>::type
8676  assign_if_same(dst_t & dst, const src_t & src) {
8677  dst = src;
8678  }
8679  template<class dst_t, class src_t> // version for different types
8680  typename std::enable_if<!std::is_same<typename dst_t::value_type,
8681  typename src_t::value_type>::value>::type
8682  assign_if_same(dst_t & dst, const src_t & src) {
8683  // do nothing - handled manually
8684  }
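 /* Sketch of the dispatch above (the views are hypothetical examples):

      Kokkos::View<double**> dst;
      Kokkos::View<double**> src_same;  // value types match
      Kokkos::View<int**>    src_other; // value types differ

      assign_if_same(dst, src_same);  // resolves to the assigning overload
      assign_if_same(dst, src_other); // resolves to the empty overload;
                                      // the caller allocates and copies
 */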
8685 };
8686 
8687 template <typename Adapter>
8688 bool Zoltan2_AlgMJ<Adapter>::mj_premigrate_to_subset(
8689  int used_num_ranks,
8690  int migration_selection_option,
8691  RCP<const Environment> mj_env_,
8692  RCP<const Comm<int> > mj_problemComm_,
8693  int coord_dim_,
8694  mj_lno_t num_local_coords_,
8695  mj_gno_t num_global_coords_, size_t num_global_parts_,
8696  Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8697  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8698  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
8699  int num_weights_per_coord_,
8700  Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8701  //results
8702  RCP<const Comm<int> > & result_problemComm_,
8703  mj_lno_t &result_num_local_coords_,
8704  Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8705  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8706  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8707  result_mj_coordinates_,
8708  Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8709  int * &result_actual_owner_rank_)
8710 {
8711  mj_env_->timerStart(MACRO_TIMERS,
8712  timer_base_string + "PreMigration DistributorPlanCreating");
8713 
8714  int myRank = mj_problemComm_->getRank();
8715  int worldSize = mj_problemComm_->getSize();
8716 
8717  mj_part_t groupsize = worldSize / used_num_ranks;
8718 
8719  std::vector<mj_part_t> group_begins(used_num_ranks + 1, 0);
8720 
8721  mj_part_t i_am_sending_to = 0;
8722  bool am_i_a_receiver = false;
8723 
8724  for(int i = 0; i < used_num_ranks; ++i) {
8725  group_begins[i+ 1] = group_begins[i] + groupsize;
8726  if(worldSize % used_num_ranks > i) group_begins[i+ 1] += 1;
8727  if(i == used_num_ranks - 1) group_begins[i + 1] = worldSize;
8728  if(myRank >= group_begins[i] && myRank < group_begins[i + 1]) {
8729  i_am_sending_to = group_begins[i];
8730  }
8731  if(myRank == group_begins[i]) {
8732  am_i_a_receiver = true;
8733  }
8734  }
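 // Worked example of the grouping above: worldSize = 10 and
 // used_num_ranks = 4 give groupsize = 2 with remainder 10 % 4 = 2 spread
 // over the first two groups, so group_begins = {0, 3, 6, 8, 10}.
 // Ranks 0-2 send to rank 0, ranks 3-5 to rank 3, ranks 6-7 to rank 6,
 // ranks 8-9 to rank 8; the four group leaders are the receivers.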
8735 
8736  ArrayView<const mj_part_t> idView(&(group_begins[0]), used_num_ranks );
8737  result_problemComm_ = mj_problemComm_->createSubcommunicator(idView);
8738 
8739  Tpetra::Distributor distributor(mj_problemComm_);
8740 
8741  std::vector<mj_part_t>
8742  coordinate_destinations(num_local_coords_, i_am_sending_to);
8743 
8744  ArrayView<const mj_part_t>
8745  destinations(&(coordinate_destinations[0]), num_local_coords_);
8746  mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
8747  result_num_local_coords_ = num_incoming_gnos;
8748  mj_env_->timerStop(MACRO_TIMERS,
8749  timer_base_string + "PreMigration DistributorPlanCreating");
8750 
8751  mj_env_->timerStart(MACRO_TIMERS,
8752  timer_base_string + "PreMigration DistributorMigration");
8753 
8754 
8755  // migrate gnos.
8756  // MPI buffers should be on Kokkos::HostSpace not Kokkos::CudaUVMSpace
8757  // Note, with UVM space, create_mirror_view does NOT create a non-UVM
8758  // view; need the explicit Host creation and deep_copy.
8759  {
8760  Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
8761  Kokkos::ViewAllocateWithoutInitializing("sent_gnos"),
8762  initial_mj_gnos_.size()); // initial_mj_gnos_ is const mj_gno_t *
8763  Kokkos::deep_copy(sent_gnos, initial_mj_gnos_);
8764 
8765  Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos (
8766  Kokkos::ViewAllocateWithoutInitializing("received_gnos"),
8767  num_incoming_gnos);
8768 
8769  distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
8770 
8771  result_initial_mj_gnos_ = Kokkos::View<mj_gno_t*, device_t>(
8772  Kokkos::ViewAllocateWithoutInitializing("result_initial_mj_gnos_"),
8773  num_incoming_gnos);
8774  Kokkos::deep_copy(result_initial_mj_gnos_, received_gnos);
8775  }
8776 
8777  // migrate coordinates
8778  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8779 
8780  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>
8781  host_src_coordinates(
8782  Kokkos::ViewAllocateWithoutInitializing("mj_coordinates"),
8783  this->mj_coordinates.extent(0), this->mj_coordinates.extent(1));
8784 
8785  Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
8786 
8787  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> dst_coordinates(
8788  Kokkos::ViewAllocateWithoutInitializing("mj_coordinates"),
8789  num_incoming_gnos, this->coord_dim);
8790 
8791  Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_coord(
8792  Kokkos::ViewAllocateWithoutInitializing("received_coord"),
8793  num_incoming_gnos);
8794 
8795  for(int i = 0; i < this->coord_dim; ++i) {
8796 
8797  auto sent_coord = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
8798 
8799  distributor.doPostsAndWaits(sent_coord, 1, received_coord);
8800 
8801  Kokkos::deep_copy(Kokkos::subview(dst_coordinates, Kokkos::ALL, i),
8802  received_coord);
8803  Kokkos::fence();
8804  }
8805  result_mj_coordinates_ = dst_coordinates;
8806 
8807  // migrate weights.
8808 
8809  Kokkos::View<mj_scalar_t**, device_t> dst_weights(
8810  Kokkos::ViewAllocateWithoutInitializing("mj_weights"),
8811  num_incoming_gnos, this->num_weights_per_coord);
8812  auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
8813 
8814  auto host_src_weights = Kokkos::create_mirror_view_and_copy(
8815  Kokkos::HostSpace(), this->mj_weights);
8816 
8817  // contiguous buffers to gather potentially strided data
8818  Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_weight(
8819  Kokkos::ViewAllocateWithoutInitializing("send_weight_buffer"),
8820  this->num_local_coords);
8821 
8822  Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_weight(
8823  Kokkos::ViewAllocateWithoutInitializing("received_weight_buffer"),
8824  num_incoming_gnos);
8825 
8826  for(int i = 0; i < this->num_weights_per_coord; ++i) {
8827 
8828  auto sub_host_src_weights
8829  = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
8830  auto sub_host_dst_weights
8831  = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
8832 
8833  // Layout Right means these weights are not contiguous, so copy
8834  // through a contiguous buffer. Note: no current configuration uses
8835  // more than 1 weight per coordinate, so this code is untested for
8836  // num weights > 1. Other places in the code also do not handle that
8837  // possibility; evaluating all of it and adding tests would be
8838  // another project.
8839  for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
8840  sent_weight[n] = sub_host_src_weights(n);
8841  }
8842 
8843  distributor.doPostsAndWaits(sent_weight, 1, received_weight);
8844 
8845  // Again we copy by index due to layout
8846  for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
8847  sub_host_dst_weights(n) = received_weight[n];
8848  }
8849  }
8850  Kokkos::deep_copy(dst_weights, host_dst_weights);
8851  result_mj_weights_ = dst_weights;
8852 
8853  // migrate the owners of the coordinates
8854  {
8855  Kokkos::View<int*, Kokkos::HostSpace> sent_owners(
8856  Kokkos::ViewAllocateWithoutInitializing("sent_owners"),
8857  num_local_coords_);
8858  Kokkos::deep_copy(sent_owners, myRank);
8859 
8860  Kokkos::View<int*, Kokkos::HostSpace> received_owners(
8861  Kokkos::ViewAllocateWithoutInitializing("received_owners"),
8862  num_incoming_gnos);
8863 
8864  distributor.doPostsAndWaits(sent_owners, 1, received_owners);
8865 
8866  result_actual_owner_rank_ = new int[num_incoming_gnos];
8867  memcpy(
8868  result_actual_owner_rank_,
8869  received_owners.data(),
8870  num_incoming_gnos * sizeof(int));
8871  }
8872 
8873  mj_env_->timerStop(MACRO_TIMERS,
8874  timer_base_string + "PreMigration DistributorMigration");
8875  return am_i_a_receiver;
8876 }
8877 
8885 template <typename Adapter>
8886 void Zoltan2_AlgMJ<Adapter>::partition(
8887  const RCP<PartitioningSolution<Adapter> > &solution)
8888 {
8889  // purpose of this code is to validate node and UVM status for the tests
8890  // std::cout << "Memory Space: " << mj_node_t::memory_space::name() << " "
8891  // << "Execution Space: " << mj_node_t::execution_space::name()
8892  // << std::endl;
8893 
8894  int execute_counter =
8895  Zoltan2_AlgMJ_TrackCallsCounter::get_counter_Zoltan2_AlgMJ();
8896  timer_base_string = "partition(" + std::to_string(execute_counter) + ") - ";
8897 
8898  this->mj_env->timerStart(MACRO_TIMERS, timer_base_string + "all");
8899  {
8900  this->mj_env->timerStart(MACRO_TIMERS, timer_base_string + "setup");
8901 
8902  this->set_up_partitioning_data(solution);
8903 
8904  this->set_input_parameters(this->mj_env->getParameters());
8905  if(this->mj_keep_part_boxes) {
8906  this->mj_partitioner.set_to_keep_part_boxes();
8907  }
8908 
8909  this->mj_partitioner.set_partitioning_parameters(
8910  this->distribute_points_on_cut_lines,
8911  this->max_concurrent_part_calculation,
8912  this->check_migrate_avoid_migration_option,
8913  this->minimum_migration_imbalance, this->migration_type);
8914 
8915  RCP<const Comm<int> > result_problemComm = this->mj_problemComm;
8916  mj_lno_t result_num_local_coords = this->num_local_coords;
8917  Kokkos::View<mj_gno_t*, device_t> result_initial_mj_gnos;
8918  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8919  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8920  result_mj_coordinates = this->mj_coordinates;
8921  Kokkos::View<mj_scalar_t**, device_t> result_mj_weights =
8922  this->mj_weights;
8923  int *result_actual_owner_rank = NULL;
8924 
8925  Kokkos::View<const mj_gno_t*, device_t> result_initial_mj_gnos_ =
8926  this->initial_mj_gnos;
8927 
8928  // TODO: MD 08/2017: Further discussion is required.
8929  // MueLu calls MJ when it has very few coordinates per processor,
8930  // such as 10. For example, it begins with 1K processors with 1K
8931  // coordinates each. Then with coarsening this reduces to 10
8932  // coordinates per processor. It calls MJ to repartition these to
8933  // 10 parts. MJ runs with 1K processors, 10 coordinates each, and
8934  // partitions to 10 parts. As expected, strong scaling is a problem
8935  // here, because computation is almost 0 and the communication cost
8936  // of MJ increases linearly. The premigration option gathers the
8937  // coordinates to 10 parts before MJ starts, so MJ runs on a smaller
8938  // subset of the ranks. Below, I am migrating the coordinates if
8939  // mj_premigration_option is set, the result part count is less than
8940  // the current rank count, and the average number of local
8941  // coordinates is below some threshold. Premigration may not help
8942  // if, e.g., 1000 processors partition data into 10 parts but each
8943  // already holds 1M coordinates.
8944  int current_world_size = this->mj_problemComm->getSize();
8945  mj_lno_t threshold_num_local_coords =
8946  this->min_coord_per_rank_for_premigration;
8947  bool is_pre_migrated = false;
8948  bool am_i_in_subset = true;
8949 
8950  // Note that we need to add testing for migration and should also cover the
8951  // zoltan case when ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION is defined.
8952  // Currently did a minimal test of this code by running mjTest with
8953  // PM=1, TB=0 then run again with C=3 instead of C=4 (numProcs is 4).
8954  if(mj_premigration_option > 0 &&
8955  size_t (current_world_size) > this->num_global_parts &&
8956  this->num_global_coords < mj_gno_t (
8957  current_world_size * threshold_num_local_coords))
8958  {
8959  if(this->mj_keep_part_boxes) {
8960  throw std::logic_error("Multijagged: mj_keep_part_boxes and "
8961  "mj_premigration_option are not supported together yet.");
8962  }
8963 
8964  is_pre_migrated = true;
8965  int migration_selection_option = mj_premigration_option;
8966  if(migration_selection_option * this->num_global_parts >
8967  (size_t) (current_world_size)) {
8968  migration_selection_option =
8969  current_world_size / this->num_global_parts;
8970  }
8971 
8972  int used_num_ranks = int (this->num_global_coords /
8973  float (threshold_num_local_coords) + 0.5);
8974 
8975  if(used_num_ranks == 0) {
8976  used_num_ranks = 1;
8977  }
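 // e.g., 100,000 global coordinates with the default 32,000-coordinate
 // threshold give used_num_ranks = int(100000 / 32000.0 + 0.5) = 3.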
8978 
8979  am_i_in_subset = this->mj_premigrate_to_subset(
8980  used_num_ranks,
8981  migration_selection_option,
8982  this->mj_env,
8983  this->mj_problemComm,
8984  this->coord_dim,
8985  this->num_local_coords,
8986  this->num_global_coords,
8987  this->num_global_parts,
8988  this->initial_mj_gnos,
8989  this->mj_coordinates,
8990  this->num_weights_per_coord,
8991  this->mj_weights,
8992  //results
8993  result_problemComm,
8994  result_num_local_coords,
8995  result_initial_mj_gnos,
8996  result_mj_coordinates,
8997  result_mj_weights,
8998  result_actual_owner_rank);
8999 
9000  result_initial_mj_gnos_ = result_initial_mj_gnos;
9001  }
9002 
9003  Kokkos::View<mj_part_t *, device_t> result_assigned_part_ids;
9004  Kokkos::View<mj_gno_t*, device_t> result_mj_gnos;
9005 
9006  this->mj_env->timerStop(MACRO_TIMERS, timer_base_string + "setup");
9007 
9008  if(am_i_in_subset) {
9009  this->mj_partitioner.multi_jagged_part(
9010  this->mj_env,
9011  result_problemComm, //this->mj_problemComm,
9012  this->imbalance_tolerance,
9013  this->num_teams,
9014  this->num_global_parts,
9015  this->part_no_array,
9016  this->recursion_depth,
9017  this->coord_dim,
9018  result_num_local_coords, //this->num_local_coords,
9019  this->num_global_coords,
9020  result_initial_mj_gnos_,
9021  result_mj_coordinates,
9022  this->num_weights_per_coord,
9023  this->mj_uniform_weights,
9024  result_mj_weights,
9025  this->mj_uniform_parts,
9026  result_assigned_part_ids,
9027  result_mj_gnos
9028  );
9029  }
9030 
9031  this->mj_env->timerStart(MACRO_TIMERS, timer_base_string + "cleanup");
9032 
9033  // Reorder results so that they match the order of the input
9034  std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid;
9035  localGidToLid.reserve(result_num_local_coords);
9036  Kokkos::View<mj_gno_t*, Kokkos::HostSpace> host_result_initial_mj_gnos(
9037  Kokkos::ViewAllocateWithoutInitializing("host_result_initial_mj_gnos"),
9038  result_initial_mj_gnos_.size());
9039  Kokkos::deep_copy(host_result_initial_mj_gnos, result_initial_mj_gnos_);
9040  for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9041  localGidToLid[host_result_initial_mj_gnos(i)] = i;
9042  }
9043 
9044  ArrayRCP<mj_part_t> partId = arcp(new mj_part_t[result_num_local_coords],
9045  0, result_num_local_coords, true);
9046  auto host_result_assigned_part_ids =
9047  Kokkos::create_mirror_view(result_assigned_part_ids);
9048  Kokkos::deep_copy(host_result_assigned_part_ids, result_assigned_part_ids);
9049  auto host_result_mj_gnos = Kokkos::create_mirror_view(result_mj_gnos);
9050  Kokkos::deep_copy(host_result_mj_gnos, result_mj_gnos);
9051  for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9052  mj_lno_t origLID = localGidToLid[host_result_mj_gnos(i)];
9053  partId[origLID] = host_result_assigned_part_ids(i);
9054  }
9055 
9056  // Now the results are reordered. But if premigration occurred,
9057  // then we need to send these ids to the actual owners again.
9058  if(is_pre_migrated) {
9059  this->mj_env->timerStart(MACRO_TIMERS, timer_base_string +
9060  "PostMigration DistributorPlanCreating");
9061  Tpetra::Distributor distributor(this->mj_problemComm);
9062 
9063  ArrayView<const mj_part_t> actual_owner_destinations(
9064  result_actual_owner_rank , result_num_local_coords);
9065 
9066  mj_lno_t num_incoming_gnos = distributor.createFromSends(
9067  actual_owner_destinations);
9068 
9069  if(num_incoming_gnos != this->num_local_coords) {
9070  throw std::logic_error("Zoltan2 - Multijagged Post Migration - "
9071  "num incoming is not equal to num local coords");
9072  }
9073 
9074  mj_env->timerStop(MACRO_TIMERS, timer_base_string +
9075  "PostMigration DistributorPlanCreating");
9076  mj_env->timerStart(MACRO_TIMERS, timer_base_string +
9077  "PostMigration DistributorMigration");
9078 
9079  Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
9080  Kokkos::ViewAllocateWithoutInitializing("received_gnos"),
9081  num_incoming_gnos);
9082  Kokkos::View<mj_part_t*, Kokkos::HostSpace> received_partids(
9083  Kokkos::ViewAllocateWithoutInitializing("received_partids"),
9084  num_incoming_gnos);
9085 
9086  distributor.doPostsAndWaits(host_result_initial_mj_gnos, 1,
9087  received_gnos);
9088  {
9089  Kokkos::View<mj_part_t*, Kokkos::HostSpace> sent_partnos;
9090  if (partId.size() > 0) {
9091  sent_partnos = Kokkos::View<mj_part_t*, Kokkos::HostSpace>(
9092  partId.getRawPtr(), partId.size()); //unmanaged
9093  }
9094  distributor.doPostsAndWaits(sent_partnos, 1, received_partids);
9095  }
9096 
9097  partId = arcp(new mj_part_t[this->num_local_coords],
9098  0, this->num_local_coords, true);
9099 
9100  {
9101  std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid2;
9102  localGidToLid2.reserve(this->num_local_coords);
9103  auto host_initial_mj_gnos =
9104  Kokkos::create_mirror_view(this->initial_mj_gnos);
9105  Kokkos::deep_copy(host_initial_mj_gnos,
9106  this->initial_mj_gnos);
9107  for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9108  localGidToLid2[host_initial_mj_gnos(i)] = i;
9109  }
9110 
9111  for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9112  mj_lno_t origLID = localGidToLid2[received_gnos[i]];
9113  partId[origLID] = received_partids[i];
9114  }
9115  }
9116 
9117  {
9118  delete [] result_actual_owner_rank;
9119  }
9120  mj_env->timerStop(MACRO_TIMERS,
9121  timer_base_string + "PostMigration DistributorMigration");
9122  }
9123  solution->setParts(partId);
9124  this->mj_env->timerStop(MACRO_TIMERS, timer_base_string + "cleanup");
9125  }
9126 
9127  this->mj_env->timerStop(MACRO_TIMERS, timer_base_string + "all");
9128 
9129  // reset the view (release the reference to device data)
9130  this->mj_coordinates = Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>();
9131 }
9132 
9133 /* \brief Sets the partitioning data for multijagged algorithm.
9134  * */
9135 template <typename Adapter>
9136 void Zoltan2_AlgMJ<Adapter>::set_up_partitioning_data(
9137  const RCP<PartitioningSolution<Adapter> > &solution
9138 )
9139 {
9140  modelFlag_t flags;
9141  CoordinateModel<Adapter> mj_coords(mj_adapter, mj_env, mj_problemComm, flags);
9142 
9143  this->coord_dim = mj_coords.getCoordinateDim();
9144  this->num_weights_per_coord = mj_coords.getNumWeightsPerCoordinate();
9145  this->num_local_coords = mj_coords.getLocalNumCoordinates();
9146  this->num_global_coords = mj_coords.getGlobalNumCoordinates();
9147 
9148  int criteria_dim = (this->num_weights_per_coord ?
9149  this->num_weights_per_coord : 1);
9150  // From the Solution we get part information.
9151  // If the part sizes for a given criteria are not uniform,
9152  // then they are values that sum to 1.0.
9153  this->num_global_parts = solution->getTargetGlobalNumberOfParts();
9154  // allocate only a two dimensional pointer.
9155  // raw pointer addresses will be obtained from the multivector.
9156  this->mj_uniform_parts = Kokkos::View<bool *, Kokkos::HostSpace>(
9157  "uniform parts", criteria_dim);
9158  this->mj_uniform_weights = Kokkos::View<bool *, Kokkos::HostSpace>(
9159  "uniform weights", criteria_dim);
9160 
9161  Kokkos::View<const mj_gno_t *, device_t> gnos;
9162  Kokkos::View<adapter_scalar_t **, Kokkos::LayoutLeft, device_t> xyz_adapter;
9163  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
9164  Kokkos::View<adapter_scalar_t **, device_t> wgts_adapter;
9165  mj_coords.getCoordinatesKokkos(gnos, xyz_adapter, wgts_adapter);
9166  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
9167  Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> xyz;
9168  Kokkos::View<mj_scalar_t **, device_t> wgts;
9169 
9170  // Now we must get the data from the adapter.
9171  // If the types match we point to the view but if not, we must copy.
9172  if(std::is_same<mj_scalar_t, adapter_scalar_t>()) {
9173  // we can just point the views but we must specialize because this code
9174  // only compiles in this case - for is_same false assign does nothing.
9175  assign_if_same(xyz, xyz_adapter);
9176  assign_if_same(wgts, wgts_adapter);
9177  }
9178  else {
9179  // we only allocate a new view if we are going to copy
9180  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
9181  xyz = Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
9182  (Kokkos::ViewAllocateWithoutInitializing(
9183  "xyz"), xyz_adapter.extent(0), xyz_adapter.extent(1));
9184  wgts = Kokkos::View<mj_scalar_t **, device_t>(
9185  Kokkos::ViewAllocateWithoutInitializing("wgts"),
9186  wgts_adapter.extent(0), wgts_adapter.extent(1));
9187 
9188  typedef typename Kokkos::View<mj_scalar_t **, device_t>::size_type view_size_t;
9189  Kokkos::parallel_for(
9190  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9191  (0, xyz_adapter.extent(0)), KOKKOS_LAMBDA (int i) {
9192  for(view_size_t n = 0; n < xyz_adapter.extent(1); ++n) {
9193  xyz(i, n) = static_cast<mj_scalar_t>(xyz_adapter(i, n));
9194  }
9195  });
9196  Kokkos::parallel_for(
9197  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9198  (0, wgts.extent(0)), KOKKOS_LAMBDA (int i) {
9199  for(view_size_t n = 0; n < wgts.extent(1); ++n) {
9200  wgts(i, n) = static_cast<mj_scalar_t>(wgts_adapter(i, n));
9201  }
9202  });
9203  }
9204 
9205  // obtain global ids.
9206  this->initial_mj_gnos = gnos;
9207  // extract coordinates from multivector.
9208  this->mj_coordinates = xyz;
9209  // if no weights are provided set uniform weight.
9210 
9211  if(this->num_weights_per_coord == 0) {
9212  this->mj_uniform_weights(0) = true;
9213  Kokkos::resize(this->mj_weights, 0, 0);
9214  }
9215  else{
9216  this->mj_weights = wgts;
9217  for(int wdim = 0; wdim < this->num_weights_per_coord; ++wdim) {
9218  this->mj_uniform_weights(wdim) = false;
9219  }
9220  }
9221 
9222  for(int wdim = 0; wdim < criteria_dim; ++wdim) {
9223  if(solution->criteriaHasUniformPartSizes(wdim)) {
9224  this->mj_uniform_parts(wdim) = true;
9225  }
9226  else {
9227  printf("Error: MJ does not support non-uniform target part weights\n");
9228  std::terminate();
9229  }
9230  }
9231 }
9232 
9233 /* \brief Sets the partitioning parameters for multijagged algorithm.
9234  * \param pl: is the parameter list provided to zoltan2 call
9235  * */
9236 template <typename Adapter>
9237 void Zoltan2_AlgMJ<Adapter>::set_input_parameters(
9238  const Teuchos::ParameterList &pl)
9239 {
9240  const Teuchos::ParameterEntry *pe = pl.getEntryPtr("imbalance_tolerance");
9241  if(pe) {
9242  double tol;
9243  tol = pe->getValue(&tol);
9244  this->imbalance_tolerance = tol - 1.0;
9245  }
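 // e.g., a user-facing tolerance of 1.03 (3% allowed imbalance) is
 // stored internally as 0.03.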
9246 
9247  // TODO: Maybe a more relaxed tolerance is needed. RCB uses 10%.
9248  if(this->imbalance_tolerance <= 0) {
9249  this->imbalance_tolerance = 10e-4;
9250  }
9251 
9252  // reset the input partitioning array; it is filled below if provided.
9253  Kokkos::resize(this->part_no_array, 0);
9254 
9255  // recursion depth; derived from the mj_parts array length if given.
9256  this->recursion_depth = 0;
9257 
9258  if(pl.getPtr<int>("mj_num_teams")) {
9259  this->num_teams = pl.get<int>("mj_num_teams");
9260  }
9261 
9262  if(pl.getPtr<Array <mj_part_t> >("mj_parts")) {
9263  auto mj_parts = pl.get<Array <mj_part_t> >("mj_parts");
9264  int mj_parts_size = static_cast<int>(mj_parts.size());
9265 
9266  // build the view we'll have data on and copy values from host
9267  this->part_no_array = Kokkos::View<mj_part_t*, Kokkos::HostSpace>(
9268  "part_no_array", mj_parts_size);
9269  for(int i = 0; i < mj_parts_size; ++i) {
9270  this->part_no_array(i) = mj_parts.getRawPtr()[i];
9271  }
9272 
9273  this->recursion_depth = mj_parts_size - 1;
9274  this->mj_env->debug(2, "mj_parts provided by user");
9275  }
9276 
9277  // get mj specific parameters.
9278  this->distribute_points_on_cut_lines = true;
9279  this->max_concurrent_part_calculation = 1;
9280 
9281  this->mj_run_as_rcb = false;
9282  this->mj_premigration_option = 0;
9283  this->min_coord_per_rank_for_premigration = 32000;
9284 
9285  int mj_user_recursion_depth = -1;
9286  this->mj_keep_part_boxes = false;
9287  this->check_migrate_avoid_migration_option = 0;
9288  this->migration_type = 0;
9289  this->minimum_migration_imbalance = 0.35;
9290 
9291  pe = pl.getEntryPtr("mj_minimum_migration_imbalance");
9292  if(pe) {
9293  double imb;
9294  imb = pe->getValue(&imb);
9295  this->minimum_migration_imbalance = imb - 1.0;
9296  }
9297 
9298  pe = pl.getEntryPtr("mj_migration_option");
9299  if(pe) {
9300  this->check_migrate_avoid_migration_option =
9301  pe->getValue(&this->check_migrate_avoid_migration_option);
9302  } else {
9303  this->check_migrate_avoid_migration_option = 0;
9304  }
9305  if(this->check_migrate_avoid_migration_option > 1) {
9306  this->check_migrate_avoid_migration_option = -1;
9307  }
9308 
9310  pe = pl.getEntryPtr("mj_migration_type");
9311  if(pe) {
9312  this->migration_type = pe->getValue(&this->migration_type);
9313  } else {
9314  this->migration_type = 0;
9315  }
9316 
9317  //std::cout << " this->migration_type:" << this->migration_type << std::endl;
9319 
9320  pe = pl.getEntryPtr("mj_concurrent_part_count");
9321  if(pe) {
9322  this->max_concurrent_part_calculation =
9323  pe->getValue(&this->max_concurrent_part_calculation);
9324  } else {
9325  this->max_concurrent_part_calculation = 1; // Set to 1 if not provided.
9326  }
9327 
9328  pe = pl.getEntryPtr("mj_keep_part_boxes");
9329  if(pe) {
9330  this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
9331  } else {
9332  this->mj_keep_part_boxes = false; // Set to default value
9333  }
9334 
9335  // For now, need keep_part_boxes to do pointAssign and boxAssign.
9336  // pe = pl.getEntryPtr("keep_cuts");
9337  // if(pe) {
9338  // int tmp = pe->getValue(&tmp);
9339  // if(tmp) this->mj_keep_part_boxes = true;
9340  // }
9341 
9342  //need to keep part boxes if mapping type is geometric.
9343  if(this->mj_keep_part_boxes == false) {
9344  pe = pl.getEntryPtr("mapping_type");
9345  if(pe) {
9346  int mapping_type = -1;
9347  mapping_type = pe->getValue(&mapping_type);
9348  if(mapping_type == 0) {
9349  mj_keep_part_boxes = true;
9350  }
9351  }
9352  }
9353 
9354  // check whether MJ should run in RCB mode.
9355  pe = pl.getEntryPtr("mj_enable_rcb");
9356  if(pe) {
9357  this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
9358  } else {
9359  this->mj_run_as_rcb = false; // Set to default value
9360  }
9361 
9362  pe = pl.getEntryPtr("mj_premigration_option");
9363  if(pe) {
9364  mj_premigration_option = pe->getValue(&mj_premigration_option);
9365  } else {
9366  mj_premigration_option = 0;
9367  }
9368 
9369  pe = pl.getEntryPtr("mj_premigration_coordinate_count");
9370  if(pe) {
9371  min_coord_per_rank_for_premigration = pe->getValue(&min_coord_per_rank_for_premigration);
9372  } else {
9373  min_coord_per_rank_for_premigration = 32000;
9374  }
9375 
9376  pe = pl.getEntryPtr("mj_recursion_depth");
9377  if(pe) {
9378  mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
9379  } else {
9380  mj_user_recursion_depth = -1; // Set to invalid value
9381  }
9382 
9383  bool val = false;
9384  pe = pl.getEntryPtr("rectilinear");
9385  if(pe) {
9386  val = pe->getValue(&val);
9387  }
9388  if(val) {
9389  this->distribute_points_on_cut_lines = false;
9390  } else {
9391  this->distribute_points_on_cut_lines = true;
9392  }
9393 
9394  if(this->mj_run_as_rcb) {
9395  mj_user_recursion_depth =
9396  (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
9397  }
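 // e.g., num_global_parts = 10 gives ceil(log2(10)) = 4 levels of
 // bisection when running as RCB.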
9398  if(this->recursion_depth < 1) {
9399  if(mj_user_recursion_depth > 0) {
9400  this->recursion_depth = mj_user_recursion_depth;
9401  }
9402  else {
9403  this->recursion_depth = this->coord_dim;
9404  }
9405  }
9406 }
9407 
9409 template <typename Adapter>
9410 void Zoltan2_AlgMJ<Adapter>::boxAssign(
9411  int dim,
9412  adapter_scalar_t *lower,
9413  adapter_scalar_t *upper,
9414  size_t &nPartsFound,
9415  typename Adapter::part_t **partsFound) const
9416 {
9417  // TODO: Implement with cuts rather than boxes to reduce algorithmic
9418  // TODO: complexity. Or at least do a search through the boxes, using
9419  // TODO: p x q x r x ... if possible.
9420 
9421  nPartsFound = 0;
9422  *partsFound = NULL;
9423 
9424  if(this->mj_keep_part_boxes) {
9425 
9426  // Get vector of part boxes
9427  RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9428 
9429  size_t nBoxes = (*partBoxes).size();
9430  if(nBoxes == 0) {
9431  throw std::logic_error("no part boxes exist");
9432  }
9433 
9434  // Determine whether the box overlaps the globalBox at all
9435  RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9436 
9437  if(globalBox->boxesOverlap(dim, lower, upper)) {
9438 
9439  std::vector<typename Adapter::part_t> partlist;
9440 
9441  // box overlaps the global box; find specific overlapping boxes
9442  for(size_t i = 0; i < nBoxes; i++) {
9443  try {
9444  if((*partBoxes)[i].boxesOverlap(dim, lower, upper)) {
9445  nPartsFound++;
9446  partlist.push_back((*partBoxes)[i].getpId());
9447  /*
9448  std::cout << "Given box (";
9449  for(int j = 0; j < dim; j++)
9450  std::cout << lower[j] << " ";
9451  std::cout << ") x (";
9452  for(int j = 0; j < dim; j++)
9453  std::cout << upper[j] << " ";
9454  std::cout << ") overlaps PartBox "
9455  << (*partBoxes)[i].getpId() << " (";
9456  for(int j = 0; j < dim; j++)
9457  std::cout << (*partBoxes)[i].getlmins()[j] << " ";
9458  std::cout << ") x (";
9459  for(int j = 0; j < dim; j++)
9460  std::cout << (*partBoxes)[i].getlmaxs()[j] << " ";
9461  std::cout << ")" << std::endl;
9462  */
9463  }
9464  }
9465  Z2_FORWARD_EXCEPTIONS;
9466  }
9467  if(nPartsFound) {
9468  *partsFound = new mj_part_t[nPartsFound];
9469  for(size_t i = 0; i < nPartsFound; i++)
9470  (*partsFound)[i] = partlist[i];
9471  }
9472  }
9473  else {
9474  // Box does not overlap the domain at all. Find the closest part
9475  // Not sure how to perform this operation for MJ without having the
9476  // cuts. With the RCB cuts, the concept of a part extending to
9477  // infinity was natural. With the boxes, it is much more difficult.
9478  // TODO: For now, return information indicating NO OVERLAP.
9479  }
9480  }
9481  else {
9482  throw std::logic_error("need to use keep_cuts parameter for boxAssign");
9483  }
9484 }
9485 
9487 template <typename Adapter>
9488 typename Zoltan2_AlgMJ<Adapter>::mj_part_t Zoltan2_AlgMJ<Adapter>::pointAssign(
9489  int dim,
9490  adapter_scalar_t *point) const
9491 {
9492  // TODO: Implement with cuts rather than boxes to reduce algorithmic
9493  // TODO: complexity. Or at least do a search through the boxes, using
9494  // TODO: p x q x r x ... if possible.
9495 
9496  if(this->mj_keep_part_boxes) {
9497  typename Adapter::part_t foundPart = -1;
9498 
9499  // Get vector of part boxes
9500  RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9501 
9502  size_t nBoxes = (*partBoxes).size();
9503  if(nBoxes == 0) {
9504  throw std::logic_error("no part boxes exist");
9505  }
9506 
9507  // Determine whether the point is within the global domain
9508  RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9509 
9510  if(globalBox->pointInBox(dim, point)) {
9511 
9512  // point is in the global domain; determine in which part it is.
9513  size_t i;
9514  for(i = 0; i < nBoxes; i++) {
9515  try {
9516  if((*partBoxes)[i].pointInBox(dim, point)) {
9517  foundPart = (*partBoxes)[i].getpId();
9518  // std::cout << "Point (";
9519  // for(int j = 0; j < dim; j++) std::cout << point[j] << " ";
9520  // std::cout << ") found in box " << i << " part " << foundPart
9521  // << std::endl;
9522  // (*partBoxes)[i].print();
9523  break;
9524  }
9525  }
9526  Z2_FORWARD_EXCEPTIONS;
9527  }
9528 
9529  if(i == nBoxes) {
9530  // This error should never occur
9531  std::ostringstream oss;
9532  oss << "Point (";
9533  for(int j = 0; j < dim; j++) oss << point[j] << " ";
9534  oss << ") not found in domain";
9535  throw std::logic_error(oss.str());
9536  }
9537  }
9538 
9539  else {
9540  // Point is outside the global domain.
9541  // Determine to which part it is closest.
9542  // TODO: with cuts, would not need this special case
9543 
9544  typedef typename Zoltan2::coordinateModelPartBox::coord_t coord_t;
9545  size_t closestBox = 0;
9546  coord_t minDistance = std::numeric_limits<coord_t>::max();
9547  coord_t *centroid = new coord_t[dim];
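 // Compare squared Euclidean distances to each box centroid; sqrt is
 // monotone, so it can be skipped when only the nearest box is needed.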
9548  for(size_t i = 0; i < nBoxes; i++) {
9549  (*partBoxes)[i].computeCentroid(centroid);
9550  coord_t sum = 0.;
9551  coord_t diff;
9552  for(int j = 0; j < dim; j++) {
9553  diff = centroid[j] - point[j];
9554  sum += diff * diff;
9555  }
9556  if(sum < minDistance) {
9557  minDistance = sum;
9558  closestBox = i;
9559  }
9560  }
9561  foundPart = (*partBoxes)[closestBox].getpId();
9562  delete [] centroid;
9563  }
9564 
9565  return foundPart;
9566  }
9567  else {
9568  throw std::logic_error("need to use keep_cuts parameter for pointAssign");
9569  }
9570 }
9571 
9572 template <typename Adapter>
9573 void Zoltan2_AlgMJ<Adapter>::getCommunicationGraph(
9574  const PartitioningSolution<Adapter> *solution,
9575  ArrayRCP<typename Zoltan2_AlgMJ<Adapter>::mj_part_t> &comXAdj,
9576  ArrayRCP<typename Zoltan2_AlgMJ<Adapter>::mj_part_t> &comAdj)
9577 {
9578  if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL) {
9579  RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
9580  mj_part_t ntasks = (*pBoxes).size();
9581  int dim = (*pBoxes)[0].getDim();
9582  GridHash grid(pBoxes, ntasks, dim);
9583  grid.getAdjArrays(comXAdj_, comAdj_);
9584  }
9585  comAdj = comAdj_;
9586  comXAdj = comXAdj_;
9587 }
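 /* The returned arrays form a CSR-style adjacency over parts: comXAdj
    holds prefix offsets into comAdj, and the comAdj entries between
    consecutive offsets list the parts whose boxes touch a given part.
    (The exact offset convention is assumed from the usual Zoltan2
    xadj/adj layout produced by GridHash.) */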
9588 
9589 template <typename Adapter>
9590 RCP<typename Zoltan2_AlgMJ<Adapter>::mj_partBoxVector_t>
9591 Zoltan2_AlgMJ<Adapter>::getGlobalBoxBoundaries() const
9592 {
9593  return this->mj_partitioner.get_kept_boxes();
9594 }
9595 } // namespace Zoltan2
9596 
9597 #endif
Algorithm defines the base class for all algorithms.
map_t::local_ordinal_type lno_t
Definition: mapRemotes.cpp:26
mj_partBoxVector_t & getPartBoxesView() const
for partitioning methods, return bounding boxes of the
#define Z2_ASSERT_VALUE(actual, expected)
Throw an error when actual value is not equal to expected value.
RCP< mj_partBoxVector_t > get_kept_boxes() const
DOCWORK: Documentation.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyIntValidator()
Exists to make setting up validators less cluttered.
Kokkos::View< part_t *, device_t > parts
RCP< mj_partBoxVector_t > compute_global_box_boundaries(RCP< mj_partBoxVector_t > &localPartBoxes) const
DOCWORK: Documentation.
uMultiSortItem(IT index_, CT count_, WT *vals_)
KOKKOS_INLINE_FUNCTION ArrayCombinationReducer(scalar_t mj_max_scalar, value_type &val, int mj_value_count_rightleft, int mj_value_count_weights)
KOKKOS_INLINE_FUNCTION void init(value_type dst) const
size_t getLocalNumCoordinates() const
Returns the number of coordinates on this process.
int getNumWeightsPerCoordinate() const
Returns the number (0 or greater) of weights per coordinate.
Define IntegerRangeList validator.
size_t team_shmem_size(int team_size) const
Defines the CoordinateModel classes.
void multi_jagged_part(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, double imbalance_tolerance, int num_teams, size_t num_global_parts, Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, int recursion_depth, int coord_dim, mj_lno_t num_local_coords, mj_gno_t num_global_coords, Kokkos::View< const mj_gno_t *, device_t > &initial_mj_gnos, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates, int num_weights_per_coord, Kokkos::View< bool *, Kokkos::HostSpace > &mj_uniform_weights, Kokkos::View< mj_scalar_t **, device_t > &mj_weights, Kokkos::View< bool *, Kokkos::HostSpace > &mj_uniform_parts, Kokkos::View< mj_part_t *, device_t > &result_assigned_part_ids, Kokkos::View< mj_gno_t *, device_t > &result_mj_gnos)
Multi Jagged coordinate partitioning algorithm.
KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const
Kokkos::View< scalar_t * > scalar_view_t
Tpetra::global_size_t global_size_t
Zoltan2_MJArrayType< scalar_t > value_type
mj_part_t pointAssign(int dim, adapter_scalar_t *point) const
KOKKOS_INLINE_FUNCTION void init(value_type &dst) const
#define Z2_THROW_OUTSIDE_ERROR(env)
Throw an error returned from outside the Zoltan2 library.
int getCoordinateDim() const
Returns the dimension of the coordinates.
Kokkos::View< scalar_t *, device_t > coordinates
ReduceArrayFunctor(part_t mj_concurrent_current_part, part_t mj_weight_array_size, Kokkos::View< index_t *, device_t > &mj_permutations, Kokkos::View< scalar_t *, device_t > &mj_coordinates, Kokkos::View< part_t *, device_t > &mj_parts, Kokkos::View< index_t *, device_t > &mj_part_xadj, Kokkos::View< index_t *, device_t > &mj_track_on_cuts)
Kokkos::View< index_t *, device_t > part_xadj
KOKKOS_INLINE_FUNCTION Zoltan2_MJArrayType(scalar_t *pSetPtr)
A gathering of useful namespace methods.
void uqsort(IT n, uSortItem< IT, WT > *arr)
Quick sort function. Sorts the arr of uSortItems, with respect to increasing vals. DOCWORK: Document input params.
Contains Teuchos redcution operators for the Multi-jagged algorthm.
RCP< mj_partBox_t > get_global_box() const
DOCWORK: Documentation.
KOKKOS_INLINE_FUNCTION void join(volatile value_type &dst, const volatile value_type &src) const
size_t team_shmem_size(int team_size) const
Multi Jagged coordinate partitioning algorithm.
static void getValidParameters(ParameterList &pl)
Set up validators specific to this algorithm.
void sequential_task_partitioning(const RCP< const Environment > &env, mj_lno_t num_total_coords, mj_lno_t num_selected_coords, size_t num_target_part, int coord_dim, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates_, Kokkos::View< mj_lno_t *, device_t > &initial_selected_coords_output_permutation, mj_lno_t *output_xadj, int recursion_depth_, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, bool partition_along_longest_dim, int num_ranks_per_node, bool divide_to_prime_first_, mj_part_t num_first_level_parts_=1, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &first_level_distribution_=Kokkos::View< mj_part_t *, Kokkos::HostSpace >())
Special function for partitioning for task mapping. Runs sequential, and performs deterministic parti...