Zoltan2
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
Zoltan2_AlgMultiJagged.hpp
Go to the documentation of this file.
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // Zoltan2: A package of combinatorial algorithms for scientific computing
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact Karen Devine (kddevin@sandia.gov)
39 // Erik Boman (egboman@sandia.gov)
40 // Siva Rajamanickam (srajama@sandia.gov)
41 //
42 // ***********************************************************************
43 //
44 // @HEADER
49 #ifndef _ZOLTAN2_ALGMultiJagged_HPP_
50 #define _ZOLTAN2_ALGMultiJagged_HPP_
51 
54 #include <Zoltan2_Parameters.hpp>
55 #include <Zoltan2_Algorithm.hpp>
58 #include <Zoltan2_Util.hpp>
59 #include <Tpetra_Distributor.hpp>
60 #include <Teuchos_StandardParameterEntryValidators.hpp>
61 #include <Teuchos_ParameterList.hpp>
62 #include <Kokkos_Sort.hpp>
63 
64 #include <algorithm> // std::sort
65 #include <vector>
66 #include <unordered_map>
67 
68 #ifdef ZOLTAN2_USEZOLTANCOMM
69 #ifdef HAVE_ZOLTAN2_MPI
70 #define ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
71 #include "zoltan_comm_cpp.h"
72 #include "zoltan_types.h" // for error codes
73 #endif
74 #endif
75 
76 namespace Teuchos{
77 
81 template <typename Ordinal, typename T>
82 class Zoltan2_BoxBoundaries : public ValueTypeReductionOp<Ordinal,T>
83 {
84 private:
85  Ordinal size;
86  T epsilon;
87 
88 public:
91  Zoltan2_BoxBoundaries() : size(0),
92  epsilon(std::numeric_limits<T>::epsilon()) {}
93 
97  Zoltan2_BoxBoundaries(Ordinal s_):
98  size(s_), epsilon(std::numeric_limits<T>::epsilon()) {}
99 
105  void reduce( const Ordinal count, const T inBuffer[], T inoutBuffer[]) const {
106  for(Ordinal i = 0; i < count; i++) {
107  if(Z2_ABS(inBuffer[i]) > epsilon) {
108  inoutBuffer[i] = inBuffer[i];
109  }
110  }
111  }
112 };
113 
114 } // namespace Teuchos
115 
116 namespace Zoltan2{
117 
124 template <typename IT, typename CT, typename WT>
126 {
127 public:
128  // TODO: Why volatile?
129  // no idea, another intel compiler failure.
130  volatile IT index;
131  volatile CT count;
132  volatile WT *val;
133  volatile WT epsilon;
134 
136  this->index = 0;
137  this->count = 0;
138  this->val = NULL;
140  }
141 
142  // TODO: Document these methods?
143  uMultiSortItem(IT index_ ,CT count_, WT *vals_) {
144  this->index = index_;
145  this->count = count_;
146  this->val = vals_;
148  }
149 
151  }
152 
153  void set(IT index_ ,CT count_, WT *vals_) {
154  this->index = index_;
155  this->count = count_;
156  this->val = vals_;
157  }
158 
159  bool operator<(const uMultiSortItem<IT,CT,WT>& other) const {
160  assert(this->count == other.count);
161  for(CT i = 0; i < this->count; ++i) {
162  // if the values are equal go to next one.
163  if(std::abs(this->val[i] - other.val[i]) < this->epsilon) {
164  continue;
165  }
166  // if next value is smaller return true;
167  if(this->val[i] < other.val[i]) {
168  return true;
169  }
170  // if next value is bigger return false;
171  else {
172  return false;
173  }
174  }
175  // if they are totally equal.
176  return this->index < other.index;
177  }
178 };
179 
/*! \brief Sort item pairing an id with the value used as the sort key.
 */
template <class IT, class WT>
struct uSortItem
{
  IT id;
  WT val;
};

/*! \brief Non-recursive quicksort that orders the items ascending by val.
 *  Sub-ranges shorter than M items are finished with insertion sort; longer
 *  ones are partitioned around the median of three items, and pending
 *  sub-ranges are kept on an explicit stack.
 *  \param n number of items in arr.
 *  \param arr items to sort in place.
 *
 *  Prints a message and calls std::terminate() if the explicit stack
 *  overflows.
 */
template <class IT, class WT>
void uqsort(IT n, uSortItem<IT, WT> * arr) {
  // Fewer than two items are already sorted. Returning here also keeps
  // ir-l from wrapping around when IT is unsigned and n is 0.
  if(n < 2) {
    return;
  }
  const int NSTACK = 50;
  int M = 7;
  IT i, ir=n, j, k, l=1;
  // jstack grows in steps of two and istack[jstack] is written directly, so
  // slots 1..NSTACK are used: the array needs NSTACK + 1 elements.
  IT jstack=0, istack[NSTACK + 1];
  WT aval;
  uSortItem<IT, WT> a;

  // The algorithm below addresses the items as arr[1..n].
  --arr;
  for(;;) {
    if(ir-l < M) {
      // short sub-range: insertion sort
      for(j=l+1;j<=ir;j++) {
        a=arr[j];
        aval = a.val;
        for(i=j-1;i>=1;i--) {
          if(arr[i].val <= aval)
            break;
          arr[i+1] = arr[i];
        }
        arr[i+1]=a;
      }
      if(jstack == 0)
        break;
      // pop the next pending sub-range
      ir=istack[jstack--];
      l=istack[jstack--];
    }
    else {
      // order arr[l+1] <= arr[l] <= arr[ir]; arr[l] becomes the pivot and
      // the other two act as sentinels for the scans below
      k=(l+ir) >> 1;
      std::swap(arr[k],arr[l+1]);
      if(arr[l+1].val > arr[ir].val) {
        std::swap(arr[l+1],arr[ir]);
      }
      if(arr[l].val > arr[ir].val) {
        std::swap(arr[l],arr[ir]);
      }
      if(arr[l+1].val > arr[l].val) {
        std::swap(arr[l+1],arr[l]);
      }
      i=l+1;
      j=ir;
      a=arr[l];
      aval = a.val;
      for(;;) {
        do i++; while (arr[i].val < aval);
        do j--; while (arr[j].val > aval);
        if(j < i) break;
        std::swap(arr[i],arr[j]);
      }
      // place the pivot at its final position
      arr[l]=arr[j];
      arr[j]=a;
      jstack += 2;
      if(jstack > NSTACK) {
        std::cout << "uqsort: NSTACK too small in sort." << std::endl;
        std::terminate();
      }
      // push the larger sub-range, continue with the smaller one
      if(ir-i+1 >= j-l) {
        istack[jstack]=ir;
        istack[jstack-1]=i;
        ir=j-1;
      }
      else {
        istack[jstack]=j-1;
        istack[jstack-1]=l;
        l=i;
      }
    }
  }
}
262 
/*! \brief Sort item that carries its sign in a separate flag.
 *  Items order by sign first (negative before positive). Among two positive
 *  items the smaller val orders first; among two negative items the larger
 *  val orders first.
 */
template <class IT, class WT, class SIGN>
struct uSignedSortItem
{
  IT id;
  WT val;
  SIGN signbit; // 1 means positive, 0 means negative.

  /*! \brief Strict ordering described in the struct comment. */
  bool operator<(const uSignedSortItem<IT, WT, SIGN>& rhs) const {
    /*if I am negative, the other is positive*/
    if(this->signbit < rhs.signbit) {
      return true;
    }
    /*if both has the same sign*/
    else if(this->signbit == rhs.signbit) {
      if(this->val < rhs.val) {//if my value is smaller,
        return this->signbit;//then if we both are positive return true.
        //if we both are negative, return false.
      }
      else if(this->val > rhs.val) {//if my value is larger,
        return !this->signbit; //then if we both are positive return false.
        //if we both are negative, return true.
      }
      else { //if both are equal.
        return false;
      }
    }
    else {
      /*if I am positive, the other is negative*/
      return false;
    }
  }

  /*! \brief True when both items have equal val and sign, or *this < rhs.
   *  Const-qualified so it can be used on const items, like operator<.
   */
  bool operator<=(const uSignedSortItem<IT, WT, SIGN>& rhs) const {
    return (this->val == rhs.val && this->signbit == rhs.signbit) || (*this < rhs);
  }
};
298 
302 template <class IT, class WT, class SIGN>
304  const IT NSTACK = 50;
305  IT M = 7;
306  IT i, ir=n, j, k, l=1;
307  IT jstack=0, istack[NSTACK];
309 
310  --arr;
311  for(;;) {
312  if(ir < M + l) {
313  for(j=l+1;j<=ir;j++) {
314  a=arr[j];
315  for(i=j-1;i>=1;i--) {
316  if(arr[i] <= a) {
317  break;
318  }
319  arr[i+1] = arr[i];
320  }
321  arr[i+1]=a;
322  }
323  if(jstack == 0) {
324  break;
325  }
326  ir=istack[jstack--];
327  l=istack[jstack--];
328  }
329  else {
330  k=(l+ir) >> 1;
331  std::swap(arr[k],arr[l+1]);
332  if(arr[ir] < arr[l+1]) {
333  std::swap(arr[l+1],arr[ir]);
334  }
335  if(arr[ir] < arr[l] ) {
336  std::swap(arr[l],arr[ir]);
337  }
338  if(arr[l] < arr[l+1]) {
339  std::swap(arr[l+1],arr[l]);
340  }
341  i=l+1;
342  j=ir;
343  a=arr[l];
344  for(;;) {
345  do i++; while (arr[i] < a);
346  do j--; while (a < arr[j]);
347  if(j < i) break;
348  std::swap(arr[i],arr[j]);
349  }
350  arr[l]=arr[j];
351  arr[j]=a;
352  jstack += 2;
353  if(jstack > NSTACK) {
354  std::cout << "uqsort: NSTACK too small in sort." << std::endl;
355  std::terminate();
356  }
357  if(ir+l+1 >= j+i) {
358  istack[jstack]=ir;
359  istack[jstack-1]=i;
360  ir=j-1;
361  }
362  else {
363  istack[jstack]=j-1;
364  istack[jstack-1]=l;
365  l=i;
366  }
367  }
368  }
369 }
370 
371 // This exists only so we can track how many times the MJ algorithm is
372 // called and put each of those into different timer names.
373 // Currently the MultiJaggedTest.cpp will actually call it twice.
374 // First time with data from a Tpetra MultiVector and then a second time using
375 // a BasicVectorAdapter which allows us to turn UVM off for some tests. The
376 // results of the two runs are compared which helps to catch a lot of bugs. For
377 // profiling I'm mostly just interested in the UVM off case and need it to be
378 // in separate timers. Passing a value through would mess up the API. Possibly
379 // we could check the Adapter and use that. The statics have to be outside the
380 // templated class as the two called instances will be different template
381 // parameters. Another complication is that MultiJagged.cpp will call through
382 // the Zoltan2_AlgMJ class and we want to time things in both classes. However
383 // TaskMapper will directly call AlgMJ so I made two counters for the two
384 // classes to make sure it was always correct. This does not impact any
385 // behavior and has the sole purpose of generating unique timer names. If you
386 // run an MJ test you'll see MJ(0) and MJ(1) in the names to distinguish the
387 // 1st and 2nd run. Right now only MultijaggedTest.cpp cares about this.
// Returns the number of earlier calls to this function (0 on the first
// call) and advances the function-local counter by one.
static int get_counter_AlgMJ() {
  static int calls_so_far = 0;
  const int current = calls_so_far;
  ++calls_so_far;
  return current;
}
394  static int counter = 0;
395  return counter++;
396  }
397 };
398 
401 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
402  typename mj_part_t, typename mj_node_t>
403 class AlgMJ
404 {
405 private:
406  typedef typename mj_node_t::device_type device_t; // for views
408  typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
409 
410  //if the (last dimension reduce all count) x the mpi world size
411  //estimated to be bigger than this number then migration will be forced
412  //in earlier iterations.
413  static constexpr size_t future_reduceall_cutoff = 1500000;
414 
415  //if parts right before the last dimension are estimated to have fewer than
416  //min_work_last_dim coords, migration will be forced in earlier iterations.
417  static constexpr mj_lno_t min_work_last_dim = 1000;
418 
419  static constexpr mj_scalar_t least_signifiance = 0.0001;
420  static constexpr int significance_mul = 1000;
421 
422  std::string mj_timer_base_string; // for convenience making timer names
423 
424  RCP<const Environment> mj_env; // the environment object
425  RCP<const Comm<int> > mj_problemComm; // initial comm object
426  RCP<Comm<int> > comm; // comm object than can be altered during execution
427  double imbalance_tolerance; // input imbalance tolerance.
428  int recursion_depth; // number of steps that partitioning will be solved in.
429  int coord_dim; // coordinate dim
430  int num_weights_per_coord; // # of weights per coord
431  size_t initial_num_loc_coords; // initial num local coords.
432  global_size_t initial_num_glob_coords; // initial num global coords.
433  mj_lno_t num_local_coords; // number of local coords.
434  mj_gno_t num_global_coords; // number of global coords.
435  mj_scalar_t sEpsilon; // epsilon for mj_scalar_t
436 
437  // can distribute points on the same coordinate to different parts.
438  bool distribute_points_on_cut_lines;
439 
440  // how many parts we can calculate concurrently.
441  mj_part_t max_concurrent_part_calculation;
442 
443  bool mj_run_as_rcb; // means recursion depth is adjusted to maximum value.
444  int mj_user_recursion_depth; // the recursion depth value provided by user.
445  bool mj_keep_part_boxes; // if the boxes need to be kept.
446 
447  // whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
448  int check_migrate_avoid_migration_option;
449 
450  // when doing the migration, 0 will aim for perfect load balance, 1 will
451  // aim for a minimized number of messages with possibly bad load imbalance
452  int migration_type;
453 
454  // when MJ decides whether to migrate, the minimum imbalance for migration.
455  double minimum_migration_imbalance;
456 
457  // Nonuniform first level partitioning
458  // (Currently available only for sequential_task_partitioning):
459  // Used for Dragonfly task mapping by partitioning Dragonfly RCA
460  // machine coordinates and application coordinates.
461  // An optimization that completely partitions the most important machine dimension
462  // first (i.e. the Dragonfly group coordinate, or RCA's x coordinate). The standard
463  // MJ alg follows after the nonuniform first level partitioning.
464  //
465  // Ex. (first level partitioning): If we have 120 elements,
466  // num_first_level_parts = 3, first_level_distribution = [4, 10, 6], then
467  // part sizes after first level will be [24, 60, 36]. Standard uniform MJ
468  // continues for all subsequent levels.
469 
470  // If used, number of parts requested for a nonuniform
471  // first level partitioning
472  mj_part_t num_first_level_parts;
473 
474  // If used, the requested distribution of parts for the
475  // nonuniform first level partitioning
476  Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;
477 
478  mj_part_t total_num_cut ; // how many cuts will be totally
479  mj_part_t total_num_part; // how many parts will be totally
480 
481  mj_part_t max_num_part_along_dim ; // maximum part count along a dimension.
482  mj_part_t max_num_cut_along_dim; // maximum cut count along a dimension.
483 
484  // maximum part+cut count along a dimension.
485  size_t max_num_total_part_along_dim;
486 
487  mj_part_t total_dim_num_reduce_all; // estimate on #reduceAlls can be done.
488 
489  // max no of parts that might occur during the partition before the last
490  // partitioning dimension.
491  mj_part_t last_dim_num_part;
492 
493  // input part array specifying num part to divide along each dim.
494  Kokkos::View<mj_part_t *, Kokkos::HostSpace> part_no_array;
495 
496  // two dimension coordinate array
497  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
498  Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
499  mj_coordinates;
500 
501  // two dimension weight array
502  Kokkos::View<mj_scalar_t **, device_t> mj_weights;
503 
504  // if the target parts are uniform
505  Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_parts;
506 
507  // if the coordinates have uniform weights
508  Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_weights;
509 
510  int mj_num_teams; // the number of teams
511 
512  size_t num_global_parts; // the targeted number of parts
513 
514  // vector of all boxes for all parts, constructed if mj_keep_part_boxes true
515  RCP<mj_partBoxVector_t> kept_boxes;
516 
517  RCP<mj_partBox_t> global_box;
518 
519  int myRank; // processor rank
520  int myActualRank; // initial rank
521 
522  bool divide_to_prime_first;
523 
524  // initial global ids of the coordinates.
525  Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
526 
527  // current global ids of the coordinates, might change during migration.
528  Kokkos::View<mj_gno_t*, device_t> current_mj_gnos;
529 
530  // the actual processor owner of the coordinate, to track after migrations.
531  Kokkos::View<int*, Kokkos::HostSpace> owner_of_coordinate;
532 
533  // permutation of coordinates, for partitioning.
534  Kokkos::View<mj_lno_t*, device_t> coordinate_permutations;
535 
536  // permutation work array.
537  Kokkos::View<mj_lno_t*, device_t> new_coordinate_permutations;
538 
539  // the part ids assigned to coordinates.
540  Kokkos::View<mj_part_t*, device_t> assigned_part_ids;
541 
542  // beginning and end of each part.
543  Kokkos::View<mj_lno_t *, device_t> part_xadj;
544 
545  // work array for beginning and end of each part.
546  Kokkos::View<mj_lno_t *, device_t> new_part_xadj;
547 
548  Kokkos::View<mj_scalar_t *, device_t> all_cut_coordinates;
549 
550  // how much weight should a MPI put left side of the each cutline
551  Kokkos::View<mj_scalar_t *, device_t>
552  process_cut_line_weight_to_put_left;
553 
554  // weight percentage each thread in MPI puts left side of the each cutline
555  Kokkos::View<mj_scalar_t *, device_t>
556  thread_cut_line_weight_to_put_left;
557 
558  // work array to manipulate coordinate of cutlines in different iterations.
559  // necessary because previous cut line information is used for determining
560  // the next cutline information. therefore, cannot update the cut work array
561  // until all cutlines are determined.
562  Kokkos::View<mj_scalar_t *, device_t> cut_coordinates_work_array;
563 
564  // Used for swapping above cut_coordinates_work_array
565  Kokkos::View<mj_scalar_t *, device_t> temp_cut_coords;
566 
567  // cumulative part weight array.
568  Kokkos::View<mj_scalar_t *, device_t> target_part_weights;
569 
570  // upper bound coordinate of a cut line
571  Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_coordinates;
572 
573  // lower bound coordinate of a cut line
574  Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_coordinates;
575 
576  // lower bound weight of a cut line
577  Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_weights;
578 
579  // upper bound weight of a cut line
580  Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_weights;
581 
582  // combined array to exchange the min and max coordinate, and total
583  // weight of part.
584  Kokkos::View<mj_scalar_t *, device_t>
585  process_local_min_max_coord_total_weight;
586 
587  // global combined array with the results for min, max and total weight.
588  Kokkos::View<mj_scalar_t *, device_t>
589  global_min_max_coord_total_weight;
590 
591  // isDone is used to determine if a cutline is determined already. If a cut
592  // line is already determined, the next iterations will skip this cut line.
593  Kokkos::View<bool *, device_t> is_cut_line_determined;
594 
595  // incomplete_cut_count count holds the number of cutlines that have not
596  // been finalized for each part when concurrentPartCount>1, using this
597  // information, if incomplete_cut_count[x]==0, then no work is done
598  // for this part.
599  Kokkos::View<mj_part_t *, device_t> device_incomplete_cut_count;
600  typename decltype(device_incomplete_cut_count)::HostMirror
601  incomplete_cut_count;
602 
603  // Need a quick accessor for this on host
604  typename decltype (part_xadj)::HostMirror host_part_xadj;
605 
606  // local part weights of each thread.
607  Kokkos::View<double *, device_t>
608  thread_part_weights;
609 
610  // the work manipulation array for partweights.
611  Kokkos::View<double *, device_t>
612  thread_part_weight_work;
613 
614  // thread_cut_left_closest_point to hold the closest coordinate
615  // to a cutline from left (for each thread).
616  Kokkos::View<mj_scalar_t *, device_t>
617  thread_cut_left_closest_point;
618 
619  // thread_cut_right_closest_point to hold the closest coordinate
620  // to a cutline from right (for each thread)
621  Kokkos::View<mj_scalar_t *, device_t>
622  thread_cut_right_closest_point;
623 
624  // to store how many points in each part a thread has.
625  Kokkos::View<mj_lno_t *, device_t>
626  thread_point_counts;
627 
628  Kokkos::View<mj_scalar_t *, device_t> process_rectilinear_cut_weight;
629  Kokkos::View<mj_scalar_t *, device_t> global_rectilinear_cut_weight;
630 
631  // for faster communication, concatenation of
632  // totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
633  // leftClosest distances sized P-1, since P-1 cut lines
634  // rightClosest distances size P-1, since P-1 cut lines.
635  Kokkos::View<mj_scalar_t *, device_t>
636  total_part_weight_left_right_closests;
637  Kokkos::View<mj_scalar_t *, device_t>
638  global_total_part_weight_left_right_closests;
639 
640  Kokkos::View<mj_part_t*, device_t> device_num_partitioning_in_current_dim;
641  typename decltype(device_num_partitioning_in_current_dim)::HostMirror
642  host_num_partitioning_in_current_dim; // for quick access on host
643 
644  /* \brief helper functio to calculate imbalance.
645  * \param achieved balance we achieved.
646  * \param expected balance expected.
647  */
648  static
649  KOKKOS_INLINE_FUNCTION
650  double calculate_imbalance(mj_scalar_t achieved, mj_scalar_t expected) {
651  return static_cast<double>(achieved) / static_cast<double>(expected) - 1.0;
652  }
653 
654  /* \brief Either the mj array (part_no_array) or num_global_parts should be
655  * provided in the input. part_no_array takes precedence if both are
656  * provided. Depending on these parameters, total cut/part number, maximum
657  * part/cut number along a dimension, estimated number of reduceAlls,
658  * and the number of parts before the last dimension is calculated.
659  * */
660  void set_part_specifications();
661 
662  /* \brief Tries to determine the part number for current dimension,
663  * by trying to make the partitioning as square as possible.
664  * \param num_total_future how many more partitionings are required.
665  * \param root how many more recursion depth is left.
666  */
667  inline mj_part_t get_part_count(
668  mj_part_t num_total_future,
669  double root);
670 
671  /* \brief for part communication we keep track of the box boundaries.
672  * This is performed when either asked specifically, or when geometric
673  * mapping is performed afterwards. This function initializes a single box
674  * with all global min and max coordinates.
675  * \param initial_partitioning_boxes the input and output vector for boxes.
676  */
677  void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);
678 
679  /* \brief Function returns how many parts that will be obtained after this
680  * dimension partitioning. It sets how many parts each current part will be
681  * partitioned into in this dimension to device_num_partitioning_in_current_dim
682  * vector, sets how many total future parts each obtained part will be
683  * partitioned into in next_future_num_parts_in_parts vector, If part boxes
684  * are kept, then sets initializes the output_part_boxes as its ancestor.
685  * \param future_num_part_in_parts: input, how many future parts each
686  * current part will be partitioned into.
687  * \param next_future_num_parts_in_parts: output, how many future parts
688  * each obtained part will be partitioned into.
689  * \param future_num_parts: output, max number of future parts that will be
690  * obtained from a single
691  * \param current_num_parts: input, how many parts are there currently.
692  * \param current_iteration: input, current dimension iteration number.
693  * \param input_part_boxes: input, if boxes are kept, current boxes.
694  * \param output_part_boxes: output, if boxes are kept, the initial box
695  * boundaries for obtained parts.
696  * \param atomic_part_count // DOCWORK: Documentation
697  */
698  mj_part_t update_part_num_arrays(
699  std::vector<mj_part_t> *future_num_part_in_parts,
700  std::vector<mj_part_t> *next_future_num_parts_in_parts,
701  mj_part_t &future_num_parts,
702  mj_part_t current_num_parts,
703  int current_iteration,
704  RCP<mj_partBoxVector_t> input_part_boxes,
705  RCP<mj_partBoxVector_t> output_part_boxes,
706  mj_part_t atomic_part_count);
707 
719  static
720  KOKKOS_INLINE_FUNCTION
721  void mj_calculate_new_cut_position (
722  mj_scalar_t cut_upper_bound,
723  mj_scalar_t cut_lower_bound,
724  mj_scalar_t cut_upper_weight,
725  mj_scalar_t cut_lower_weight,
726  mj_scalar_t expected_weight,
727  mj_scalar_t &new_cut_position,
728  mj_scalar_t sEpsilon);
729 
754  bool mj_perform_migration(
755  mj_part_t in_num_parts, //current number of parts
756  mj_part_t &out_num_parts, //output number of parts.
757  std::vector<mj_part_t> *next_future_num_parts_in_parts,
758  mj_part_t &output_part_begin_index,
759  size_t migration_reduce_all_population,
760  mj_lno_t num_coords_for_last_dim_part,
761  std::string iteration,
762  RCP<mj_partBoxVector_t> &input_part_boxes,
763  RCP<mj_partBoxVector_t> &output_part_boxes);
764 
782  bool mj_check_to_migrate(
783  size_t migration_reduce_all_population,
784  mj_lno_t num_coords_for_last_dim_part,
785  mj_part_t num_procs,
786  mj_part_t num_parts,
787  mj_gno_t *num_points_in_all_processor_parts);
788 
813  void mj_migration_part_proc_assignment(
814  mj_gno_t * num_points_in_all_processor_parts,
815  mj_part_t num_parts,
816  mj_part_t num_procs,
817  mj_lno_t *send_count_to_each_proc,
818  std::vector<mj_part_t> &processor_ranks_for_subcomm,
819  std::vector<mj_part_t> *next_future_num_parts_in_parts,
820  mj_part_t &out_num_part,
821  std::vector<mj_part_t> &out_part_indices,
822  mj_part_t &output_part_numbering_begin_index,
823  int *coordinate_destinations);
824 
850  void mj_assign_proc_to_parts(
851  mj_gno_t * num_points_in_all_processor_parts,
852  mj_part_t num_parts,
853  mj_part_t num_procs,
854  mj_lno_t *send_count_to_each_proc,
855  std::vector<mj_part_t> &processor_ranks_for_subcomm,
856  std::vector<mj_part_t> *next_future_num_parts_in_parts,
857  mj_part_t &out_part_index,
858  mj_part_t &output_part_numbering_begin_index,
859  int *coordinate_destinations);
860 
876  void assign_send_destinations(
877  mj_part_t num_parts,
878  mj_part_t *part_assignment_proc_begin_indices,
879  mj_part_t *processor_chains_in_parts,
880  mj_lno_t *send_count_to_each_proc,
881  int *coordinate_destinations);
882 
897  void assign_send_destinations2(
898  mj_part_t num_parts,
899  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
900  int *coordinate_destinations,
901  mj_part_t &output_part_numbering_begin_index,
902  std::vector<mj_part_t> *next_future_num_parts_in_parts);
903 
926  void mj_assign_parts_to_procs(
927  mj_gno_t * num_points_in_all_processor_parts,
928  mj_part_t num_parts,
929  mj_part_t num_procs,
930  mj_lno_t *send_count_to_each_proc,
931  std::vector<mj_part_t> *next_future_num_parts_in_parts,
932  mj_part_t &out_num_part,
933  std::vector<mj_part_t> &out_part_indices,
934  mj_part_t &output_part_numbering_begin_index,
935  int *coordinate_destinations);
936 
950  void mj_migrate_coords(
951  mj_part_t num_procs,
952  mj_lno_t &num_new_local_points,
953  std::string iteration,
954  int *coordinate_destinations,
955  mj_part_t num_parts);
956 
962  void create_sub_communicator(
963  std::vector<mj_part_t> &processor_ranks_for_subcomm);
964 
969  mj_part_t find_largest_prime_factor(mj_part_t num_parts) {
970  mj_part_t largest_factor = 1;
971  mj_part_t n = num_parts;
972  mj_part_t divisor = 2;
973  while (n > 1) {
974  while (n % divisor == 0) {
975  n = n / divisor;
976  largest_factor = divisor;
977  }
978  ++divisor;
979  if(divisor * divisor > n) {
980  if(n > 1) {
981  largest_factor = n;
982  }
983  break;
984  }
985  }
986  return largest_factor;
987  }
988 
989 public:
990  AlgMJ();
991 
992  // DOCWORK: Make param documentation use : consistently
1018  void multi_jagged_part(
1019  const RCP<const Environment> &env,
1020  RCP<const Comm<int> > &problemComm,
1021  double imbalance_tolerance,
1022  int num_teams,
1023  size_t num_global_parts,
1024  Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array,
1025  int recursion_depth,
1026  int coord_dim,
1027  mj_lno_t num_local_coords,
1028  mj_gno_t num_global_coords,
1029  Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos,
1030  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
1031  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates,
1032  int num_weights_per_coord,
1033  Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights,
1034  Kokkos::View<mj_scalar_t**, device_t> & mj_weights,
1035  Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts,
1036  Kokkos::View<mj_part_t*, device_t> & result_assigned_part_ids,
1037  Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos);
1038 
1052  bool distribute_points_on_cut_lines_,
1053  int max_concurrent_part_calculation_,
1054  int check_migrate_avoid_migration_option_,
1055  double minimum_migration_imbalance_,
1056  int migration_type_ = 0);
1057 
1060  void set_to_keep_part_boxes();
1061 
1064  RCP<mj_partBox_t> get_global_box() const;
1065 
1068  RCP<mj_partBoxVector_t> get_kept_boxes() const;
1069 
1072  RCP<mj_partBoxVector_t> compute_global_box_boundaries(
1073  RCP<mj_partBoxVector_t> &localPartBoxes) const;
1074 
1114  const RCP<const Environment> &env,
1115  mj_lno_t num_total_coords,
1116  mj_lno_t num_selected_coords,
1117  size_t num_target_part,
1118  int coord_dim,
1119  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
1120  Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
1121  Kokkos::View<mj_lno_t *, device_t> &
1122  initial_selected_coords_output_permutation,
1123  mj_lno_t *output_xadj,
1124  int recursion_depth_,
1125  const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array,
1126  bool partition_along_longest_dim,
1127  int num_ranks_per_node,
1128  bool divide_to_prime_first_,
1129  mj_part_t num_first_level_parts_ = 1,
1130  const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_
1131  = Kokkos::View<mj_part_t *, Kokkos::HostSpace>());
1132 
1133 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
1134  public:
1135 #else
1136  private:
1137 #endif
1138 
1139  /* \brief Allocates all required memory for the mj partitioning algorithm.
1140  */
1141  void allocate_set_work_memory();
1142 
1143  /* \brief compute global bounding box: min/max coords of global domain */
1144  void compute_global_box();
1145 
1146  // DOCWORK: Inconsistent use of ! for descriptive/brief commenting - decide.
1153  void mj_get_local_min_max_coord_totW(
1154  mj_part_t current_work_part,
1155  mj_part_t current_concurrent_num_parts,
1156  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords);
1157 
1170  void mj_get_global_min_max_coord_totW(
1171  mj_part_t current_concurrent_num_parts,
1172  Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
1173  Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total);
1174 
1205  void mj_get_initial_cut_coords_target_weights(
1206  mj_scalar_t min_coord,
1207  mj_scalar_t max_coord,
1208  mj_part_t num_cuts/*p-1*/ ,
1209  mj_scalar_t global_weight,
1210  Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
1211  Kokkos::View<mj_scalar_t *, device_t> & target_part_weights,
1212  std::vector <mj_part_t> *future_num_part_in_parts,
1213  std::vector <mj_part_t> *next_future_num_parts_in_parts,
1214  mj_part_t concurrent_current_part,
1215  mj_part_t obtained_part_index,
1216  mj_part_t num_target_first_level_parts = 1,
1217  const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist =
1218  Kokkos::View<mj_part_t *, Kokkos::HostSpace>());
1219 
1236  void set_initial_coordinate_parts(
1237  mj_scalar_t &max_coordinate,
1238  mj_scalar_t &min_coordinate,
1239  mj_lno_t coordinate_begin_index,
1240  mj_lno_t coordinate_end_index,
1241  Kokkos::View<mj_lno_t *, device_t> &
1242  mj_current_coordinate_permutations,
1243  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1244  Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
1245  mj_part_t &partition_count);
1246 
1263  void mj_1D_part(
1264  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1265  double imbalanceTolerance,
1266  mj_part_t current_work_part,
1267  mj_part_t current_concurrent_num_parts,
1268  Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
1269  mj_part_t total_incomplete_cut_count,
1270  Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
1271  Kokkos::View<size_t*, device_t> & view_total_reduction_size);
1272 
1278  void mj_1D_part_get_part_weights(
1279  mj_part_t current_concurrent_num_parts,
1280  mj_part_t current_work_part,
1281  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1282  int loop_count);
1283 
1291  void mj_combine_rightleft_and_weights(
1292  mj_part_t current_work_part,
1293  mj_part_t current_concurrent_num_parts);
1294 
1307  void mj_create_new_partitions(
1308  mj_part_t num_parts,
1309  mj_part_t current_concurrent_work_part,
1310  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1311  Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
1312  Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
1313  Kokkos::View<mj_lno_t *, device_t> & out_part_xadj);
1314 
1350  void mj_get_new_cut_coordinates(
1351  mj_part_t current_concurrent_num_parts,
1352  mj_part_t kk,
1353  const mj_part_t &num_cuts,
1354  const double &used_imbalance_tolerance,
1355  Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
1356  Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
1357  Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
1358  Kokkos::View<bool *, device_t> & current_cut_line_determined,
1359  Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
1360  Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
1361  Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
1362  Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
1363  Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
1364  Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
1365  Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
1366  Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
1367  Kokkos::View<mj_scalar_t *, device_t> &
1368  current_part_cut_line_weight_to_put_left,
1369  Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count);
1370 
1380  void get_processor_num_points_in_parts(
1381  mj_part_t num_procs,
1382  mj_part_t num_parts,
1383  mj_gno_t *&num_points_in_all_processor_parts);
1384 
1389  void fill_permutation_array(
1390  mj_part_t output_num_parts,
1391  mj_part_t num_parts);
1392 
1414  void create_consistent_chunks(
1415  mj_part_t num_parts,
1416  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1417  Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
1418  mj_lno_t coordinate_begin,
1419  mj_lno_t coordinate_end,
1420  Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
1421  Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
1422  int coordInd,
1423  bool longest_dim_part,
1424  uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted);
1425 
1434  void set_final_parts(
1435  mj_part_t current_num_parts,
1436  mj_part_t output_part_begin_index,
1437  RCP<mj_partBoxVector_t> &output_part_boxes,
1438  bool is_data_ever_migrated);
1439 };
1440 
1443 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1444  typename mj_part_t, typename mj_node_t>
1446  mj_env(), mj_problemComm(), comm(), imbalance_tolerance(0),
1447  recursion_depth(0), coord_dim(0),
1448  num_weights_per_coord(0), initial_num_loc_coords(0),
1449  initial_num_glob_coords(0),
1450  num_local_coords(0), num_global_coords(0),
1451  sEpsilon(std::numeric_limits<mj_scalar_t>::epsilon() * 100),
1452  distribute_points_on_cut_lines(true),
1453  max_concurrent_part_calculation(1),
1454  mj_run_as_rcb(false), mj_user_recursion_depth(0),
1455  mj_keep_part_boxes(false),
1456  check_migrate_avoid_migration_option(0), migration_type(0),
1457  minimum_migration_imbalance(0.30),
1458  num_first_level_parts(1),
1459  total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
1460  max_num_cut_along_dim(0),
1461  max_num_total_part_along_dim(0),
1462  total_dim_num_reduce_all(0),
1463  last_dim_num_part(0),
1464  mj_num_teams(0),
1465  num_global_parts(1),
1466  kept_boxes(), global_box(),
1467  myRank(0), myActualRank(0),
1468  divide_to_prime_first(false)
1469 {
1470 }
1471 
1515 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1516  typename mj_part_t, typename mj_node_t>
1519  const RCP<const Environment> &env,
1520  mj_lno_t num_total_coords,
1521  mj_lno_t num_selected_coords,
1522  size_t num_target_part,
1523  int coord_dim_,
1524  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
1525  Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> &
1526  mj_coordinates_,
1527  Kokkos::View<mj_lno_t *, device_t> & initial_adjList_output_adjlist,
1528  mj_lno_t *output_xadj,
1529  int recursion_depth_,
1530  const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array_,
1531  bool partition_along_longest_dim,
1532  int num_ranks_per_node,
1533  bool divide_to_prime_first_,
1534  mj_part_t num_first_level_parts_,
1535  const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_)
1536 {
1537  this->mj_env = env;
1538  const RCP<Comm<int> > commN;
1539  this->mj_problemComm = Teuchos::DefaultComm<int>::getDefaultSerialComm(commN);
1540  this->comm = Teuchos::rcp_const_cast<Comm<int> >(this->mj_problemComm);
1541  this->myActualRank = this->myRank = 1;
1542 
1543  this->divide_to_prime_first = divide_to_prime_first_;
1544  //weights are uniform for task mapping
1545 
1546  //parts are uniform for task mapping
1547  //as input indices.
1548  this->imbalance_tolerance = 0;
1549  this->num_global_parts = num_target_part;
1550  this->part_no_array = part_no_array_;
1551  this->recursion_depth = recursion_depth_;
1552 
1553  // If nonuniform first level partitioning, the requested num of parts and the
1554  // requested distribution of elements for each part
1555  this->num_first_level_parts = num_first_level_parts_;
1556 
1557  this->first_level_distribution = first_level_distribution_;
1558 
1559  this->coord_dim = coord_dim_;
1560  this->num_local_coords = num_total_coords;
1561 
1562  this->num_global_coords = num_total_coords;
1563  this->mj_coordinates = mj_coordinates_;
1564 
1565 
1566  this->initial_mj_gnos =
1567  Kokkos::View<mj_gno_t*, device_t>("gids", this->num_local_coords);
1568 
1569  this->num_weights_per_coord = 0;
1570 
1571  this->mj_uniform_weights = Kokkos::View<bool*, Kokkos::HostSpace>(
1572  "uniform weights", 1);
1573  this->mj_uniform_weights(0) = true;
1574 
1575  this->mj_weights = Kokkos::View<mj_scalar_t**, device_t>
1576  ("weights", 1, 1);
1577 
1578  this->mj_uniform_parts =
1579  Kokkos::View<bool*, Kokkos::HostSpace>("uniform parts", 1);
1580  this->mj_uniform_parts(0) = true;
1581 
1582  this->set_part_specifications();
1583 
1584  this->allocate_set_work_memory();
1585 
1586  // Do single init
1587  auto local_part_xadj = this->part_xadj;
1588  Kokkos::parallel_for(
1589  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
1590  KOKKOS_LAMBDA (int dummy) {
1591  local_part_xadj(0) = static_cast<mj_lno_t>(num_selected_coords);
1592  });
1593 
1594  Kokkos::deep_copy(coordinate_permutations, initial_adjList_output_adjlist);
1595 
1596  mj_part_t current_num_parts = 1;
1597 
1598  Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
1599  this->all_cut_coordinates;
1600 
1601  mj_part_t future_num_parts = this->total_num_part;
1602 
1603  std::vector<mj_part_t> *future_num_part_in_parts =
1604  new std::vector<mj_part_t>();
1605  std::vector<mj_part_t> *next_future_num_parts_in_parts =
1606  new std::vector<mj_part_t>();
1607  next_future_num_parts_in_parts->push_back(this->num_global_parts);
1608  RCP<mj_partBoxVector_t> t1;
1609  RCP<mj_partBoxVector_t> t2;
1610 
1611  std::vector <uSignedSortItem<int, mj_scalar_t, char>>
1612  coord_dimension_range_sorted(this->coord_dim);
1613  uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted =
1614  &(coord_dimension_range_sorted[0]);
1615  std::vector <mj_scalar_t> coord_dim_mins(this->coord_dim);
1616  std::vector <mj_scalar_t> coord_dim_maxs(this->coord_dim);
1617 
1618  // Need a device counter - how best to allocate?
1619  // Putting this allocation in the loops is very costly so moved out here.
1620  Kokkos::View<mj_part_t*, device_t>
1621  view_rectilinear_cut_count("view_rectilinear_cut_count", 1);
1622  Kokkos::View<size_t*, device_t>
1623  view_total_reduction_size("view_total_reduction_size", 1);
1624 
1625  for(int rd = 0; rd < this->recursion_depth; ++rd) {
1626  // next_future_num_parts_in_parts will have one entry per output part,
1627  // and it holds how many more parts each output part should be
1628  // divided into. This array is also used to determine the weight
1629  // ratios of the parts.
1630  // Swap the two arrays so that they can be reused in every iteration.
1631  std::vector<mj_part_t> *tmpPartVect = future_num_part_in_parts;
1632  future_num_part_in_parts = next_future_num_parts_in_parts;
1633  next_future_num_parts_in_parts = tmpPartVect;
1634 
1635  // clear next_future_num_parts_in_parts array as
1636  // getPartitionArrays expects it to be empty.
1637  next_future_num_parts_in_parts->clear();
1638 
1639  // returns the total number of output parts for this dimension partitioning.
1640  mj_part_t output_part_count_in_dimension =
1641  this->update_part_num_arrays(
1642  future_num_part_in_parts,
1643  next_future_num_parts_in_parts,
1644  future_num_parts,
1645  current_num_parts,
1646  rd,
1647  t1,
1648  t2, num_ranks_per_node);
1649 
1650  // If the number of obtained parts equals the current number of parts,
1651  // skip this dimension. For example, this happens when 1 is given in
1652  // the input part array, e.g., P=4,5,1,2.
1653  if(output_part_count_in_dimension == current_num_parts) {
1654  tmpPartVect = future_num_part_in_parts;
1655  future_num_part_in_parts = next_future_num_parts_in_parts;
1656  next_future_num_parts_in_parts = tmpPartVect;
1657  continue;
1658  }
1659 
1660  // convert rd to a string to be used for debugging purposes.
1661  std::string istring = std::to_string(rd);
1662 
1663  // alloc Memory to point the indices
1664  // of the parts in the permutation array.
1665  this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
1666  "new part xadj", output_part_count_in_dimension);
1667 
1668  // the index where in the outtotalCounts will be written.
1669 
1670  mj_part_t output_part_index = 0;
1671 
1672  // whatever is written to outTotalCounts will be added with previousEnd
1673  // so that the points will be shifted.
1674  mj_part_t output_coordinate_end_index = 0;
1675 
1676  mj_part_t current_work_part = 0;
1677  mj_part_t current_concurrent_num_parts = 1;
1678 
1679  mj_part_t obtained_part_index = 0;
1680 
1681  // get the coordinate axis along which the partitioning will be done.
1682  int coordInd = rd % this->coord_dim;
1683 
1684  Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
1685  Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1686 
1687  auto host_process_local_min_max_coord_total_weight =
1688  Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
1689  auto host_global_min_max_coord_total_weight =
1690  Kokkos::create_mirror_view(global_min_max_coord_total_weight);
1691 
1692  // run for all available parts.
1693  for(; current_work_part < current_num_parts;
1694  current_work_part += current_concurrent_num_parts) {
1695 
1696  mj_part_t actual_work_part_count = 0;
1697 
1698  // initialization for 1D partitioning.
1699  // get the min and max coordinates of each part
1700  // together with the part weights of each part.
1701  for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1702  mj_part_t current_work_part_in_concurrent_parts =
1703  current_work_part + kk;
1704 
1705  // if this part won't be partitioned any further,
1706  // don't do any work for this part.
1707  mj_part_t partition_count = host_num_partitioning_in_current_dim(
1708  current_work_part_in_concurrent_parts);
1709  if(partition_count == 1) {
1710  continue;
1711  }
1712  ++actual_work_part_count;
1713  if(partition_along_longest_dim) {
1714  auto local_process_local_min_max_coord_total_weight =
1715  this->process_local_min_max_coord_total_weight;
1716  for(int coord_traverse_ind = 0;
1717  coord_traverse_ind < this->coord_dim; ++coord_traverse_ind) {
1718 
1719  Kokkos::View<mj_scalar_t *, device_t> coords =
1720  Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coord_traverse_ind);
1721 
1722  this->mj_get_local_min_max_coord_totW(
1723  current_work_part,
1724  current_concurrent_num_parts,
1725  coords);
1726 
1727  coord_dimension_range_sorted[coord_traverse_ind].id =
1728  coord_traverse_ind;
1729  coord_dimension_range_sorted[coord_traverse_ind].signbit = 1;
1730 
1731  Kokkos::deep_copy(host_process_local_min_max_coord_total_weight,
1732  process_local_min_max_coord_total_weight);
1733 
1734  coord_dim_mins[coord_traverse_ind] =
1735  host_process_local_min_max_coord_total_weight(kk);
1736  coord_dim_maxs[coord_traverse_ind] =
1737  host_process_local_min_max_coord_total_weight(
1738  kk + current_concurrent_num_parts);
1739  coord_dimension_range_sorted[coord_traverse_ind].val =
1740  host_process_local_min_max_coord_total_weight(
1741  kk + current_concurrent_num_parts) -
1742  host_process_local_min_max_coord_total_weight(kk);
1743  }
1744 
1745  uqSignsort(this->coord_dim, p_coord_dimension_range_sorted);
1746  coordInd = p_coord_dimension_range_sorted[this->coord_dim - 1].id;
1747  auto set_min = coord_dim_mins[coordInd];
1748  auto set_max = coord_dim_maxs[coordInd];
1749  Kokkos::parallel_for(
1750  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
1751  (0, 1), KOKKOS_LAMBDA (int dummy) {
1752  local_process_local_min_max_coord_total_weight(kk) = set_min;
1753  local_process_local_min_max_coord_total_weight(
1754  kk + current_concurrent_num_parts) = set_max;
1755  });
1756 
1757  mj_current_dim_coords =
1758  Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1759  }
1760  else {
1761  Kokkos::View<mj_scalar_t *, device_t> coords =
1762  Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1763  this->mj_get_local_min_max_coord_totW(
1764  current_work_part,
1765  current_concurrent_num_parts,
1766  coords);
1767  }
1768  }
1769 
1770  // 1D partitioning
1771  if(actual_work_part_count > 0) {
1772  // obtain global Min max of the part.
1773  this->mj_get_global_min_max_coord_totW(
1774  current_concurrent_num_parts,
1775  this->process_local_min_max_coord_total_weight,
1776  this->global_min_max_coord_total_weight);
1777 
1778  // update host copy
1779  Kokkos::deep_copy(host_global_min_max_coord_total_weight,
1780  global_min_max_coord_total_weight);
1781 
1782  // represents the total number of cutlines
1783  // whose coordinate should be determined.
1784  mj_part_t total_incomplete_cut_count = 0;
1785 
1786  //Compute weight ratios for parts & cuts:
1787  //e.g., 0.25 0.25 0.5 0.5 0.75 0.75 1.0
1788  // part0 cut0 part1 cut1 part2 cut2 part3
1789  mj_part_t concurrent_part_cut_shift = 0;
1790  mj_part_t concurrent_part_part_shift = 0;
1791  for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1792  mj_scalar_t min_coordinate =
1793  host_global_min_max_coord_total_weight(kk);
1794  mj_scalar_t max_coordinate = host_global_min_max_coord_total_weight(
1795  kk + current_concurrent_num_parts);
1796  mj_scalar_t global_total_weight = host_global_min_max_coord_total_weight(
1797  kk + 2*current_concurrent_num_parts);
1798 
1799  mj_part_t concurrent_current_part_index = current_work_part + kk;
1800 
1801  mj_part_t partition_count = host_num_partitioning_in_current_dim(
1802  concurrent_current_part_index);
1803 
1804  Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
1805  Kokkos::subview(current_cut_coordinates,
1806  std::pair<mj_lno_t, mj_lno_t>(
1807  concurrent_part_cut_shift,
1808  current_cut_coordinates.size()));
1809  Kokkos::View<mj_scalar_t *, device_t>
1810  current_target_part_weights =
1811  Kokkos::subview(target_part_weights,
1812  std::pair<mj_lno_t, mj_lno_t>(
1813  concurrent_part_part_shift,
1814  target_part_weights.size()));
1815 
1816  // shift the usedCutCoordinate array as noCuts.
1817  concurrent_part_cut_shift += partition_count - 1;
1818  // shift the partRatio array as noParts.
1819  concurrent_part_part_shift += partition_count;
1820  // calculate only if the part is not empty
1821  // and the part will be further partitioned.
1822  if(partition_count > 1 && min_coordinate <= max_coordinate) {
1823  // increase allDone by the number of cuts of the current
1824  // part's cut line number.
1825  total_incomplete_cut_count += partition_count - 1;
1826 
1827  this->incomplete_cut_count(kk) = partition_count - 1;
1828 
1829  // When num_first_level_parts != 1 we have
1830  // nonuniform partitioning on the first level, providing
1831  // requested number of parts (num_first_level_parts) and
1832  // requested distribution in parts (first_level_distribution)
1833 
1834  // Get the target part weights given a desired distribution
1835  this->mj_get_initial_cut_coords_target_weights(
1836  min_coordinate,
1837  max_coordinate,
1838  partition_count - 1,
1839  global_total_weight,
1840  usedCutCoordinate,
1841  current_target_part_weights,
1842  future_num_part_in_parts,
1843  next_future_num_parts_in_parts,
1844  concurrent_current_part_index,
1845  obtained_part_index,
1846  rd == 0 ? this->num_first_level_parts : 1,
1847  this->first_level_distribution);
1848 
1849  mj_lno_t coordinate_end_index =
1850  host_part_xadj(concurrent_current_part_index);
1851  mj_lno_t coordinate_begin_index =
1852  (concurrent_current_part_index==0) ? 0 :
1853  host_part_xadj[concurrent_current_part_index - 1];
1854 
1855  // get the initial estimated part assignments of the coordinates.
1856  this->set_initial_coordinate_parts(
1857  max_coordinate,
1858  min_coordinate,
1859  coordinate_begin_index, coordinate_end_index,
1860  this->coordinate_permutations,
1861  mj_current_dim_coords,
1862  this->assigned_part_ids,
1863  partition_count);
1864  }
1865  else {
1866  // e.g., if have fewer coordinates than parts, don't need to do
1867  // next dim.
1868  this->incomplete_cut_count(kk) = 0;
1869  }
1870  obtained_part_index += partition_count;
1871  }
1872 
1873  // used imbalance, it is always 0, as it is difficult
1874  // to estimate a range.
1875  double used_imbalance = 0;
1876 
1877  // Determine cut lines for k parts here.
1878  this->mj_env->timerStart(MACRO_TIMERS,
1879  mj_timer_base_string + "mj_1D_part()");
1880 
1881  this->mj_1D_part(
1882  mj_current_dim_coords,
1883  used_imbalance,
1884  current_work_part,
1885  current_concurrent_num_parts,
1886  current_cut_coordinates,
1887  total_incomplete_cut_count,
1888  view_rectilinear_cut_count,
1889  view_total_reduction_size);
1890 
1891  this->mj_env->timerStop(MACRO_TIMERS,
1892  mj_timer_base_string + "mj_1D_part()");
1893  }
1894  else {
1895  obtained_part_index += current_concurrent_num_parts;
1896  }
1897  // create part chunks
1898  {
1899  mj_part_t output_array_shift = 0;
1900  mj_part_t cut_shift = 0;
1901  size_t tlr_shift = 0;
1902  size_t partweight_array_shift = 0;
1903 
1904  for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1905  mj_part_t current_concurrent_work_part = current_work_part + kk;
1906 
1907  mj_part_t num_parts = host_num_partitioning_in_current_dim(
1908  current_concurrent_work_part);
1909 
1910  // if the part is empty, skip the part.
1911  int coordinateA_bigger_than_coordinateB =
1912  host_global_min_max_coord_total_weight(kk) >
1913  host_global_min_max_coord_total_weight(
1914  kk + current_concurrent_num_parts);
1915 
1916  if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
1917  // we still need to write the begin and end point of the empty part.
1918  // simply set it zero, the array indices will be shifted later
1919  auto local_new_part_xadj = this->new_part_xadj;
1920  Kokkos::parallel_for(
1921  Kokkos::RangePolicy<typename mj_node_t::execution_space,
1922  mj_part_t> (0, num_parts), KOKKOS_LAMBDA(mj_part_t jj) {
1923  local_new_part_xadj(
1924  output_part_index + output_array_shift + jj) = 0;
1925  });
1926 
1927  cut_shift += num_parts - 1;
1928  tlr_shift += (4 *(num_parts - 1) + 1);
1929  output_array_shift += num_parts;
1930  partweight_array_shift += (2 * (num_parts - 1) + 1);
1931  continue;
1932  }
1933  mj_lno_t coordinate_end =
1934  host_part_xadj(current_concurrent_work_part);
1935  mj_lno_t coordinate_begin =
1936  current_concurrent_work_part==0 ? 0 :
1937  host_part_xadj(current_concurrent_work_part-1);
1938 
1939  Kokkos::View<mj_scalar_t *, device_t>
1940  current_concurrent_cut_coordinate =
1941  Kokkos::subview(current_cut_coordinates,
1942  std::pair<mj_lno_t, mj_lno_t>(
1943  cut_shift,
1944  current_cut_coordinates.size()));
1945  Kokkos::View<mj_scalar_t *, device_t>
1946  used_local_cut_line_weight_to_left =
1947  Kokkos::subview(process_cut_line_weight_to_put_left,
1948  std::pair<mj_lno_t, mj_lno_t>(
1949  cut_shift,
1950  process_cut_line_weight_to_put_left.size()));
1951 
1952  this->thread_part_weight_work =
1953  Kokkos::subview(
1954  this->thread_part_weights,
1955  std::pair<mj_lno_t, mj_lno_t>(
1956  partweight_array_shift,
1957  this->thread_part_weights.size()));
1958 
1959  if(num_parts > 1) {
1960  // Rewrite the indices based on the computed cuts.
1961  Kokkos::View<mj_lno_t *, device_t> subview_new_part_xadj =
1962  Kokkos::subview(this->new_part_xadj,
1963  std::pair<mj_lno_t, mj_lno_t>(
1964  output_part_index + output_array_shift,
1965  this->new_part_xadj.size()));
1966 
1967  this->create_consistent_chunks(
1968  num_parts,
1969  mj_current_dim_coords,
1970  current_concurrent_cut_coordinate,
1971  coordinate_begin,
1972  coordinate_end,
1973  used_local_cut_line_weight_to_left,
1974  subview_new_part_xadj,
1975  coordInd,
1976  partition_along_longest_dim,
1977  p_coord_dimension_range_sorted);
1978  }
1979  else {
1980  // if this part is partitioned into 1 then just copy
1981  // the old values.
1982  mj_lno_t part_size = coordinate_end - coordinate_begin;
1983 
1984  auto local_new_part_xadj = this->new_part_xadj;
1985  Kokkos::parallel_for(
1986  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
1987  (0, 1), KOKKOS_LAMBDA (int dummy) {
1988  local_new_part_xadj(output_part_index + output_array_shift)
1989  = part_size;
1990  });
1991 
1992  auto subview_new_coordinate_permutations =
1993  Kokkos::subview(this->new_coordinate_permutations,
1994  std::pair<mj_lno_t, mj_lno_t>(
1995  coordinate_begin,
1996  coordinate_begin + part_size));
1997  auto subview_coordinate_permutations =
1998  Kokkos::subview(this->coordinate_permutations,
1999  std::pair<mj_lno_t, mj_lno_t>(
2000  coordinate_begin,
2001  coordinate_begin + part_size));
2002  Kokkos::deep_copy(subview_new_coordinate_permutations,
2003  subview_coordinate_permutations);
2004  }
2005 
2006  cut_shift += num_parts - 1;
2007  tlr_shift += (4 *(num_parts - 1) + 1);
2008  output_array_shift += num_parts;
2009  partweight_array_shift += (2 * (num_parts - 1) + 1);
2010  }
2011 
2012  // shift cut coordinates so that all cut coordinates are stored.
2013  // current_cut_coordinates += cutShift;
2014 
2015  // getChunks from coordinates partitioned the parts and
2016  // wrote the indices as if there were a single part.
2017  // now we need to shift the beginning indices.
2018  for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
2019  mj_part_t num_parts =
2020  host_num_partitioning_in_current_dim(current_work_part + kk);
2021  auto local_new_part_xadj = this->new_part_xadj;
2022  auto local_mj_current_dim_coords = mj_current_dim_coords;
2023  auto local_new_coordinate_permutations =
2024  new_coordinate_permutations;
2025  Kokkos::parallel_for(
2026  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t> (
2027  0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
2028  //shift it by previousCount
2029  local_new_part_xadj(output_part_index+ii) +=
2030  output_coordinate_end_index;
2031 
2032  if(ii % 2 == 1) {
2033  mj_lno_t coordinate_end =
2034  local_new_part_xadj(output_part_index+ii);
2035  mj_lno_t coordinate_begin =
2036  local_new_part_xadj(output_part_index);
2037 
2038  for(mj_lno_t task_traverse = coordinate_begin;
2039  task_traverse < coordinate_end; ++task_traverse) {
2040  mj_lno_t l = local_new_coordinate_permutations(task_traverse);
2041  //MARKER: FLIPPED ZORDER BELOW
2042  local_mj_current_dim_coords(l) = -local_mj_current_dim_coords(l);
2043  }
2044  }
2045  });
2046 
2047  // increase the previous count by current end.
2048  mj_part_t get_single;
2049  Kokkos::parallel_reduce("Read new_part_xadj",
2050  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
2051  KOKKOS_LAMBDA(int dummy, mj_part_t & set_single) {
2052  set_single = local_new_part_xadj(output_part_index + num_parts - 1);
2053  }, get_single);;
2054 
2055  output_coordinate_end_index = get_single;
2056  // increase the current out.
2057  output_part_index += num_parts;
2058  }
2059  }
2060  }
2061 
2062  // end of this partitioning dimension
2063  // set the current num parts for next dim partitioning
2064  current_num_parts = output_part_count_in_dimension;
2065 
2066  //swap the coordinate permutations for the next dimension.
2067  Kokkos::View<mj_lno_t *, device_t> tmp = this->coordinate_permutations;
2068  this->coordinate_permutations = this->new_coordinate_permutations;
2069  this->new_coordinate_permutations = tmp;
2070 
2071  this->part_xadj = this->new_part_xadj;
2072  this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
2073  Kokkos::deep_copy(host_part_xadj, part_xadj); // keep in sync
2074  this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>("empty", 0);
2075  }
2076 
2077  Kokkos::deep_copy(initial_adjList_output_adjlist, coordinate_permutations);
2078 
2079  // Return output_xadj in CSR format
2080  output_xadj[0] = 0;
2081  for(size_t i = 0; i < this->num_global_parts ; ++i) {
2082  output_xadj[i+1] = host_part_xadj(i);
2083  }
2084 
2085  delete future_num_part_in_parts;
2086  delete next_future_num_parts_in_parts;
2087 }
2088 
2092 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2093  typename mj_part_t, typename mj_node_t>
2094 RCP<typename AlgMJ
2095  <mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,mj_node_t>::mj_partBox_t>
2098 {
2099  return this->global_box;
2100 }
2101 
2104 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2105  typename mj_part_t, typename mj_node_t>
2106 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2107  mj_node_t>::set_to_keep_part_boxes()
2108 {
2109  this->mj_keep_part_boxes = true;
2110 }
2111 
2112 /* \brief Either the mj array (part_no_array) or num_global_parts should be
2113  * provided in the input. part_no_array takes
2114  * precedence if both are provided.
2115  * Depending on these parameters, the total cut/part numbers, the
2116  * maximum part/cut numbers along a dimension, the estimated number of
2117  * reduceAlls, and the number of parts before the last dimension are calculated.
2118  * */
2119 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2120  typename mj_part_t, typename mj_node_t>
2123 {
2124  this->total_num_cut = 0; //how many cuts will be totally
2125  this->total_num_part = 1; //how many parts will be totally
2126  this->max_num_part_along_dim = 0; // maximum part count along a dimension.
2127  this->total_dim_num_reduce_all = 0; // estimate on #reduceAlls can be done.
2128  this->last_dim_num_part = 1; //max no of parts that might occur
2129  //during the partition before the
2130  //last partitioning dimension.
2131  this->max_num_cut_along_dim = 0;
2132  this->max_num_total_part_along_dim = 0;
2133 
2134  if(this->part_no_array.size()) {
2135  auto local_recursion_depth = this->recursion_depth;
2136 
2137  this->total_dim_num_reduce_all =
2138  this->total_num_part * this->recursion_depth;
2139 
2140  this->total_num_part = 1;
2141  for(int i = 0; i < local_recursion_depth; ++i) {
2142  this->total_num_part *= this->part_no_array(i);
2143  }
2144 
2145  mj_part_t track_max = 0;
2146  for(int i = 0; i < local_recursion_depth; ++i) {
2147  if(part_no_array(i) > track_max) {
2148  track_max = this->part_no_array(i);
2149  };
2150  }
2151 
2152  this->last_dim_num_part = this->total_num_part /
2153  this->part_no_array(local_recursion_depth-1);
2154 
2155  this->max_num_part_along_dim = track_max;
2156  this->num_global_parts = this->total_num_part;
2157  } else {
2158  mj_part_t future_num_parts = this->num_global_parts;
2159 
2160  // If using nonuniform first level partitioning.
2161  // initial value max_num_part_along_dim == num_first_level_parts
2162  if (this->first_level_distribution.size() != 0 &&
2163  this->num_first_level_parts > 1) {
2164  this->max_num_part_along_dim = this->num_first_level_parts;
2165  }
2166 
2167  // we need to calculate the part numbers now, to determine
2168  // the maximum along the dimensions.
2169  for(int rd = 0; rd < this->recursion_depth; ++rd) {
2170  mj_part_t maxNoPartAlongI = 0;
2171  mj_part_t nfutureNumParts = 0;
2172 
2173  // Nonuniform first level partitioning sets part specifications for
2174  // rd == 0 only, given requested num of parts and distribution in parts
2175  // for the first level.
2176  if (rd == 0 &&
2177  this->first_level_distribution.size() != 0 &&
2178  this->num_first_level_parts > 1) {
2179 
2180  maxNoPartAlongI = this->num_first_level_parts;
2181  this->max_num_part_along_dim = this->num_first_level_parts;
2182 
2183  mj_part_t sum_first_level_dist = 0;
2184  mj_part_t max_part = 0;
2185 
2186  // Cumulative sum of distribution of parts and size of largest part
2187  for (int i = 0; i < this->num_first_level_parts; ++i) {
2188  sum_first_level_dist += this->first_level_distribution(i);
2189  if (this->first_level_distribution(i) > max_part)
2190  max_part = this->first_level_distribution(i);
2191  }
2192 
2193  // Total parts in largest nonuniform superpart from
2194  // first level partitioning
2195  nfutureNumParts =
2196  this->num_global_parts * max_part / sum_first_level_dist;
2197  }
2198  // Standard uniform partitioning this level
2199  else {
2200  maxNoPartAlongI = this->get_part_count(future_num_parts,
2201  1.0f / (this->recursion_depth - rd));
2202  if (maxNoPartAlongI > this->max_num_part_along_dim)
2203  this->max_num_part_along_dim = maxNoPartAlongI;
2204  nfutureNumParts = future_num_parts / maxNoPartAlongI;
2205  if (future_num_parts % maxNoPartAlongI) {
2206  ++nfutureNumParts;
2207  }
2208  }
2209  future_num_parts = nfutureNumParts;
2210  }
2211  this->total_num_part = this->num_global_parts;
2212 
2213  if(this->divide_to_prime_first) {
2214  this->total_dim_num_reduce_all = this->num_global_parts * 2;
2215  this->last_dim_num_part = this->num_global_parts;
2216  }
2217  else {
2218  //this is the lower bound.
2219  //estimate reduceAll Count here.
2220  //we find the upperbound instead.
2221  size_t p = 1;
2222  for(int i = 0; i < this->recursion_depth; ++i) {
2223  this->total_dim_num_reduce_all += p;
2224  p *= this->max_num_part_along_dim;
2225  }
2226 
2227  if(p / this->max_num_part_along_dim > this->num_global_parts) {
2228  this->last_dim_num_part = this->num_global_parts;
2229  }
2230  else {
2231  this->last_dim_num_part = p / this->max_num_part_along_dim;
2232  }
2233  }
2234  }
2235 
2236  this->total_num_cut = this->total_num_part - 1;
2237  this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
2238  this->max_num_total_part_along_dim = this->max_num_part_along_dim +
2239  size_t(this->max_num_cut_along_dim);
2240  // maxPartNo is P, maxCutNo = P-1, matTotalPartcount = 2P-1
2241 
2242  // refine the concurrent part count, if it is given bigger than the maximum
2243  // possible part count.
2244  if(this->max_concurrent_part_calculation > this->last_dim_num_part) {
2245  if(this->mj_problemComm->getRank() == 0) {
2246  std::cerr << "Warning: Concurrent part count (" <<
2247  this->max_concurrent_part_calculation <<
2248  ") has been set bigger than maximum amount that can be used." <<
2249  " Setting to:" << this->last_dim_num_part << "." << std::endl;
2250  }
2251  this->max_concurrent_part_calculation = this->last_dim_num_part;
2252  }
2253 }
2254 
2255 /* \brief Tries to determine the part number for current dimension,
2256  * by trying to make the partitioning as square as possible.
2257  * \param num_total_future how many more partitionings are required.
2258  * \param root how many more recursion depth is left.
2259  */
2260 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2261  typename mj_part_t, typename mj_node_t>
2262 inline mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2263  get_part_count(mj_part_t num_total_future, double root)
2264 {
2265  double fp = pow(num_total_future, root);
2266  mj_part_t ip = mj_part_t(fp);
2267  if(fp - ip < std::numeric_limits<float>::epsilon() * 100) {
2268  return ip;
2269  }
2270  else {
2271  return ip + 1;
2272  }
2273 }
2274 
2275 /* \brief Function returns how many parts that will be obtained after this
2276  * dimension partitioning. It sets how many parts each current part will be
2277  * partitioned into in this dimension to device_num_partitioning_in_current_dim
2278  * view, sets how many total future parts each obtained part will be
2279  * partitioned into in next_future_num_parts_in_parts vector. If part boxes are
2280  * kept, then sets initializes the output_part_boxes as its ancestor.
2281  * \param future_num_part_in_parts: input, how many future parts each current
2282  * part will be partitioned into.
2283  * \param next_future_num_parts_in_parts: output, how many future parts each
2284  * obtained part will be partitioned into.
2285  * \param future_num_parts: output, max number of future parts that will be
2286  * obtained from a single
2287  * \param current_num_parts: input, how many parts are there currently.
2288  * \param current_iteration: input, current dimension iteration number.
2289  * \param input_part_boxes: input, if boxes are kept, current boxes.
2290  * \param output_part_boxes: output, if boxes are kept, the initial box
2291  * boundaries for obtained parts.
2292  * \param atomic_part_count DOCWORK: Documentation
2293  */
2294 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2295  typename mj_part_t, typename mj_node_t>
2296 mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2297  update_part_num_arrays(
2298  std::vector<mj_part_t> *future_num_part_in_parts,
2299  std::vector<mj_part_t> *next_future_num_parts_in_parts,
2300  mj_part_t &future_num_parts,
2301  mj_part_t current_num_parts,
2302  int current_iteration,
2303  RCP<mj_partBoxVector_t> input_part_boxes,
2304  RCP<mj_partBoxVector_t> output_part_boxes,
2305  mj_part_t atomic_part_count)
2306 {
2307  std::vector<mj_part_t> num_partitioning_in_current_dim;
2308 
2309  // how many parts that will be obtained after this dimension.
2310  mj_part_t output_num_parts = 0;
2311  if(this->part_no_array.size()) {
2312  // when the partNo array is provided as input,
2313  // each current partition will be partition to the same number of parts.
2314  // we dont need to use the future_num_part_in_parts vector in this case.
2315  mj_part_t current_part_no_array =
2316  this->part_no_array(current_iteration);
2317 
2318  if(current_part_no_array < 1) {
2319  std::cout << "Current recursive iteration: " << current_iteration <<
2320  " part_no_array[" << current_iteration << "] is given as:" <<
2321  current_part_no_array << std::endl;
2322  std::terminate();
2323  }
2324  if(current_part_no_array == 1) {
2325  return current_num_parts;
2326  }
2327 
2328  // If using part_no_array, ensure compatibility with num_first_level_parts.
2329  if (this->first_level_distribution.size() != 0 &&
2330  current_iteration == 0 &&
2331  current_part_no_array != this->num_first_level_parts) {
2332  std::cout << "Current recursive iteration: " << current_iteration
2333  << " part_no_array[" << current_iteration << "] is given as: " <<
2334  current_part_no_array << " and contradicts num_first_level_parts: " <<
2335  this->num_first_level_parts << std::endl;
2336  std::terminate();
2337  }
2338 
2339  for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
2340  num_partitioning_in_current_dim.push_back(current_part_no_array);
2341  }
2342 
2343 /*
2344  std::cout << "\n\nme: " << this->myRank << " current_iteration: " <<
2345  current_iteration << " current_num_parts: " <<
2346  current_num_parts << "\n\n";
2347 
2348  std::cout << "\n\nnum_partitioning_in_current_dim[0]: " <<
2349  num_partitioning_in_current_dim[0] << "\n\n";
2350 
2351  std::cout << "\n\nfuture_num_parts: " << future_num_parts
2352  << " num_partitioning_in_current_dim[0]: " <<
2353  num_partitioning_in_current_dim[0] << " " <<
2354  future_num_parts / num_partitioning_in_current_dim[0] << "\n\n";
2355 */
2356 
2357  future_num_parts /= num_partitioning_in_current_dim[0];
2358  output_num_parts = current_num_parts *
2359  num_partitioning_in_current_dim[0];
2360  if(this->mj_keep_part_boxes) {
2361  for(mj_part_t k = 0; k < current_num_parts; ++k) {
2362  //initialized the output boxes as its ancestor.
2363  for(mj_part_t j = 0; j <
2364  num_partitioning_in_current_dim[0]; ++j) {
2365  output_part_boxes->push_back((*input_part_boxes)[k]);
2366  }
2367  }
2368  }
2369 
2370  // set the how many more parts each part will be divided.
2371  // this is obvious when partNo array is provided as input.
2372  // however, fill this so weights will be calculated according to this array.
2373  for(mj_part_t ii = 0; ii < output_num_parts; ++ii) {
2374  next_future_num_parts_in_parts->push_back(future_num_parts);
2375  }
2376  }
2377  else {
2378  // if partNo array is not provided as input, future_num_part_in_parts
2379  // holds how many parts each part should be divided. Initially it holds a
2380  // single number equal to the total number of global parts.
2381 
2382  // calculate the future_num_parts from beginning,
2383  // since each part might be divided into different number of parts.
2384  future_num_parts = 1;
2385 
2386  // cout << "i:" << i << std::endl;
2387  for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
2388  // get how many parts a part should be divided.
2389  mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];
2390 
2391  // get the ideal number of parts that is close to the
2392  // (recursion_depth - i) root of the future_num_parts_of_part_ii.
2393  mj_part_t num_partitions_in_current_dim =
2394  this->get_part_count(future_num_parts_of_part_ii,
2395  1.0 / (this->recursion_depth - current_iteration)
2396  );
2397  if(num_partitions_in_current_dim > this->max_num_part_along_dim) {
2398  std::cerr << "ERROR: maxPartNo calculation is wrong."
2399  " num_partitions_in_current_dim: "
2400  << num_partitions_in_current_dim << " this->max_num_part_along_dim: "
2401  << this->max_num_part_along_dim <<
2402  " this->recursion_depth: " << this->recursion_depth <<
2403  " current_iteration:" << current_iteration <<
2404  " future_num_parts_of_part_ii: " << future_num_parts_of_part_ii <<
2405  " might need to fix max part no calculation for "
2406  "largest_prime_first partitioning." <<
2407  std::endl;
2408  std::terminate();
2409  }
2410  // add this number to vector_num_partitioning_in_current_dim vector.
2411  // num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2412  // mj_part_t largest_prime_factor = num_partitions_in_current_dim;
2413 
2414  // Update part num arrays when on current_iteration == 0 and
2415  // using nonuniform first level partitioning
2416  // with requested num parts (num_first_level_parts) and
2417  // a requested distribution in parts (first_level_distribution).
2418  if (current_iteration == 0 &&
2419  this->first_level_distribution.size() != 0 &&
2420  this->num_first_level_parts > 1) {
2421  // Only 1 current part to begin and partitions into
2422  // num_first_level_parts many parts
2423  num_partitioning_in_current_dim.push_back(this->num_first_level_parts);
2424 
2425  // The output number of parts from first level partitioning
2426  output_num_parts = this->num_first_level_parts;
2427 
2428  // Remaining parts left to partition for all future levels
2429  future_num_parts /= this->num_first_level_parts;
2430 
2431  mj_part_t max_part = 0;
2432  mj_part_t sum_first_level_dist = 0;
2433 
2434  // Cumulative sum of distribution of first level parts
2435  // and size of largest first level part
2436  for (int i = 0; i < this->num_first_level_parts; ++i) {
2437  sum_first_level_dist += this->first_level_distribution(i);
2438 
2439  if (this->first_level_distribution(i) > max_part)
2440  max_part = this->first_level_distribution(i);
2441  }
2442 
2443  // Maximum # of remaining parts left to partition for all future levels
2444  future_num_parts = this->num_global_parts * max_part / sum_first_level_dist;
2445 
2446  // Number of parts remaining left to partition for each future_part
2447  // The sum must exactly equal global_num_parts
2448  for (int i = 0; i < this->num_first_level_parts; ++i) {
2449  next_future_num_parts_in_parts->push_back(this->first_level_distribution(i) *
2450  this->num_global_parts / sum_first_level_dist);
2451  }
2452  }
2453  else if (this->divide_to_prime_first) {
2454  // Add this number to num_partitioning_in_current_dim vector.
2455  num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2456 
2457  mj_part_t largest_prime_factor = num_partitions_in_current_dim;
2458 
2459  //increase the output number of parts.
2460  output_num_parts += num_partitions_in_current_dim;
2461 
2462  if (future_num_parts_of_part_ii == atomic_part_count ||
2463  future_num_parts_of_part_ii % atomic_part_count != 0) {
2464  atomic_part_count = 1;
2465  }
2466 
2467  largest_prime_factor =
2468  this->find_largest_prime_factor(future_num_parts_of_part_ii / atomic_part_count);
2469 
2470  // We divide to num_partitions_in_current_dim. But we adjust the weights
2471  // based on largest prime/ if num_partitions_in_current_dim = 2,
2472  // largest prime = 5 --> we divide to 2 parts with weights 3x and 2x.
2473  // if the largest prime is less than part count, we use the part count
2474  // so that we divide uniformly.
2475  if (largest_prime_factor < num_partitions_in_current_dim) {
2476  largest_prime_factor = num_partitions_in_current_dim;
2477  }
2478  //ideal number of future partitions for each part.
2479  mj_part_t ideal_num_future_parts_in_part =
2480  (future_num_parts_of_part_ii / atomic_part_count) / largest_prime_factor;
2481  //if num_partitions_in_current_dim = 2, largest prime = 5 then ideal weight is 2x
2482  mj_part_t ideal_prime_scale = largest_prime_factor / num_partitions_in_current_dim;
2483 
2484 /*
2485  std::cout << "\ncurrent num part: " << ii
2486  << " largest_prime_factor: " << largest_prime_factor
2487  << " To Partition: " << future_num_parts_of_part_ii << "\n\n";
2488 */
2489 
2490  for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2491  //if num_partitions_in_current_dim = 2, largest prime = 5 then ideal weight is 2x
2492  mj_part_t my_ideal_primescale = ideal_prime_scale;
2493  //left over weighs. Left side is adjusted to be 3x, right side stays as 2x
2494  if (iii < (largest_prime_factor) % num_partitions_in_current_dim) {
2495  ++my_ideal_primescale;
2496  }
2497  //scale with 'x';
2498  mj_part_t num_future_parts_for_part_iii =
2499  ideal_num_future_parts_in_part * my_ideal_primescale;
2500 
2501  //if there is a remainder in the part increase the part weight.
2502  if (iii < (future_num_parts_of_part_ii / atomic_part_count) % largest_prime_factor) {
2503  //if not uniform, add 1 for the extra parts.
2504  ++num_future_parts_for_part_iii;
2505  }
2506 
2507  next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);
2508 
2509  //if part boxes are stored, initialize the box of the parts as the ancestor.
2510  if (this->mj_keep_part_boxes) {
2511  output_part_boxes->push_back((*input_part_boxes)[ii]);
2512  }
2513 
2514  //set num future_num_parts to maximum in this part.
2515  if (num_future_parts_for_part_iii > future_num_parts)
2516  future_num_parts = num_future_parts_for_part_iii;
2517 
2518  }
2519  }
2520  else {
2521  // Add this number to num_partitioning_in_current_dim vector.
2522  num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2523 
2524  //increase the output number of parts.
2525  output_num_parts += num_partitions_in_current_dim;
2526 
2527  if((future_num_parts_of_part_ii == atomic_part_count) ||
2528  (future_num_parts_of_part_ii % atomic_part_count != 0)) {
2529  atomic_part_count = 1;
2530  }
2531  //ideal number of future partitions for each part.
2532  mj_part_t ideal_num_future_parts_in_part =
2533  (future_num_parts_of_part_ii / atomic_part_count) /
2534  num_partitions_in_current_dim;
2535  for(mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2536  mj_part_t num_future_parts_for_part_iii =
2537  ideal_num_future_parts_in_part;
2538 
2539  //if there is a remainder in the part increase the part weight.
2540  if(iii < (future_num_parts_of_part_ii / atomic_part_count) %
2541  num_partitions_in_current_dim) {
2542  // if not uniform, add 1 for the extra parts.
2543  ++num_future_parts_for_part_iii;
2544  }
2545 
2546  next_future_num_parts_in_parts->push_back(
2547  num_future_parts_for_part_iii * atomic_part_count);
2548 
2549  // if part boxes are stored, initialize the box of the parts as
2550  // the ancestor.
2551  if(this->mj_keep_part_boxes) {
2552  output_part_boxes->push_back((*input_part_boxes)[ii]);
2553  }
2554  //set num future_num_parts to maximum in this part.
2555  if(num_future_parts_for_part_iii > future_num_parts)
2556  future_num_parts = num_future_parts_for_part_iii;
2557  }
2558  }
2559  }
2560  }
2561  // move temp std::vector to host view
2562  device_num_partitioning_in_current_dim = Kokkos::View<
2563  mj_part_t*, device_t>("test", num_partitioning_in_current_dim.size());
2564  host_num_partitioning_in_current_dim =
2565  Kokkos::create_mirror_view(device_num_partitioning_in_current_dim);
2566  for(size_t n = 0; n < num_partitioning_in_current_dim.size(); ++n) {
2567  host_num_partitioning_in_current_dim(n) =
2568  num_partitioning_in_current_dim[n];
2569  }
2570  // setup device equivalent - this data is used on host and device and it's
2571  // more efficient to just setup array on both sides now rather than copy
2572  // values as needed later.
2573  Kokkos::deep_copy(device_num_partitioning_in_current_dim,
2574  host_num_partitioning_in_current_dim);
2575  return output_num_parts;
2576 }
2577 
2578 /* \brief Allocates and initializes the work memory that will be used by MJ.
2579  * */
2580 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2581  typename mj_part_t, typename mj_node_t>
2582 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2583  allocate_set_work_memory()
2584 {
2585  // Throughout the partitioning execution,
2586  // instead of the moving the coordinates, hold a permutation array for parts.
2587  // coordinate_permutations holds the current permutation.
2588  this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2589  Kokkos::ViewAllocateWithoutInitializing("coordinate_permutations"),
2590  this->num_local_coords);
2591  auto local_coordinate_permutations = coordinate_permutations;
2592  Kokkos::parallel_for(
2593  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
2594  0, this->num_local_coords), KOKKOS_LAMBDA (mj_lno_t i) {
2595  local_coordinate_permutations(i) = i;
2596  });
2597 
2598  // new_coordinate_permutations holds the current permutation.
2599  this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2600  Kokkos::ViewAllocateWithoutInitializing("num_local_coords"),
2601  this->num_local_coords);
2602 
2603  this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2604  Kokkos::ViewAllocateWithoutInitializing("assigned parts"), 0);
2605  if(this->num_local_coords > 0) {
2606  this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2607  Kokkos::ViewAllocateWithoutInitializing("assigned part ids"),
2608  this->num_local_coords);
2609  }
2610 
2611  // single partition starts at index-0, and ends at numLocalCoords
2612  // inTotalCounts array holds the end points in coordinate_permutations array
2613  // for each partition. Initially sized 1, and single element is set to
2614  // numLocalCoords.
2615  this->part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2616  Kokkos::ViewAllocateWithoutInitializing("part xadj"), 1);
2617  this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
2618  host_part_xadj(0) = num_local_coords;
2619  Kokkos::deep_copy(this->part_xadj, host_part_xadj);
2620 
2621  // the ends points of the output, this is allocated later.
2622  this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2623  Kokkos::ViewAllocateWithoutInitializing("empty"), 0);
2624 
2625  // only store this much if cuts are needed to be stored.
2626  this->all_cut_coordinates = Kokkos::View<mj_scalar_t*, device_t>(
2627  Kokkos::ViewAllocateWithoutInitializing("all cut coordinates"),
2628  this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2629 
2630  // how much weight percentage should a MPI put left side of the each cutline
2631  this->process_cut_line_weight_to_put_left = Kokkos::View<mj_scalar_t*,
2632  device_t>(Kokkos::ViewAllocateWithoutInitializing("empty"), 0);
2633 
2634  // how much weight percentage should each thread in MPI put left side of
2635  // each outline
2636  this->thread_cut_line_weight_to_put_left =
2637  Kokkos::View<mj_scalar_t*, device_t>(
2638  Kokkos::ViewAllocateWithoutInitializing("empty"), 0);
2639 
2640  if(this->distribute_points_on_cut_lines) {
2641  this->process_cut_line_weight_to_put_left =
2642  Kokkos::View<mj_scalar_t *, device_t>(
2643  Kokkos::ViewAllocateWithoutInitializing(
2644  "process_cut_line_weight_to_put_left"),
2645  this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2646  this->thread_cut_line_weight_to_put_left =
2647  Kokkos::View<mj_scalar_t *, device_t>(
2648  Kokkos::ViewAllocateWithoutInitializing(
2649  "thread_cut_line_weight_to_put_left"),
2650  this->max_num_cut_along_dim);
2651  this->process_rectilinear_cut_weight =
2652  Kokkos::View<mj_scalar_t *, device_t>(
2653  Kokkos::ViewAllocateWithoutInitializing("process_rectilinear_cut_weight"),
2654  this->max_num_cut_along_dim);
2655  this->global_rectilinear_cut_weight =
2656  Kokkos::View<mj_scalar_t *, device_t>(
2657  Kokkos::ViewAllocateWithoutInitializing("global_rectilinear_cut_weight"),
2658  this->max_num_cut_along_dim);
2659  }
2660 
2661  // work array to manipulate coordinate of cutlines in different iterations.
2662  // necessary because previous cut line information is used for determining
2663  // the next cutline information. therefore, cannot update the cut work array
2664  // until all cutlines are determined.
2665  this->cut_coordinates_work_array =
2666  Kokkos::View<mj_scalar_t *, device_t>(
2667  Kokkos::ViewAllocateWithoutInitializing("cut_coordinates_work_array"),
2668  this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2669 
2670  // cumulative part weight array.
2671  this->target_part_weights = Kokkos::View<mj_scalar_t*, device_t>(
2672  Kokkos::ViewAllocateWithoutInitializing("target_part_weights"),
2673  this->max_num_part_along_dim * this->max_concurrent_part_calculation);
2674 
2675  // upper bound coordinate of a cut line
2676  this->cut_upper_bound_coordinates =
2677  Kokkos::View<mj_scalar_t*, device_t>(
2678  Kokkos::ViewAllocateWithoutInitializing("cut_upper_bound_coordinates"),
2679  this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2680 
2681  // lower bound coordinate of a cut line
2682  this->cut_lower_bound_coordinates =
2683  Kokkos::View<mj_scalar_t*, device_t>(
2684  Kokkos::ViewAllocateWithoutInitializing("cut_lower_bound_coordinates"),
2685  this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2686 
2687  // lower bound weight of a cut line
2688  this->cut_lower_bound_weights =
2689  Kokkos::View<mj_scalar_t*, device_t>(
2690  Kokkos::ViewAllocateWithoutInitializing("cut_lower_bound_weights"),
2691  this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2692 
2693  //upper bound weight of a cut line
2694  this->cut_upper_bound_weights =
2695  Kokkos::View<mj_scalar_t*, device_t>(
2696  Kokkos::ViewAllocateWithoutInitializing("cut_upper_bound_weights"),
2697  this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2698 
2699  // combined array to exchange the min and max coordinate,
2700  // and total weight of part.
2701  this->process_local_min_max_coord_total_weight =
2702  Kokkos::View<mj_scalar_t*, device_t>(
2703  Kokkos::ViewAllocateWithoutInitializing(
2704  "process_local_min_max_coord_total_weight"),
2705  3 * this->max_concurrent_part_calculation);
2706 
2707  // global combined array with the results for min, max and total weight.
2708  this->global_min_max_coord_total_weight =
2709  Kokkos::View<mj_scalar_t*, device_t>(
2710  Kokkos::ViewAllocateWithoutInitializing("global_min_max_coord_total_weight"),
2711  3 * this->max_concurrent_part_calculation);
2712 
2713  // is_cut_line_determined is used to determine if a cutline is
2714  // determined already. If a cut line is already determined, the next
2715  // iterations will skip this cut line.
2716  this->is_cut_line_determined = Kokkos::View<bool *, device_t>(
2717  Kokkos::ViewAllocateWithoutInitializing("is_cut_line_determined"),
2718  this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2719 
2720  // incomplete_cut_count count holds the number of cutlines that have not
2721  // been finalized for each part when concurrentPartCount>1, using this
2722  // information, if incomplete_cut_count[x]==0, then no work is done for
2723  // this part.
2724  this->device_incomplete_cut_count = Kokkos::View<mj_part_t *, device_t>(
2725  Kokkos::ViewAllocateWithoutInitializing("device_incomplete_cut_count"),
2726  this->max_concurrent_part_calculation);
2727  this->incomplete_cut_count =
2728  Kokkos::create_mirror_view(device_incomplete_cut_count);
2729 
2730  // local part weights of each thread.
2731  this->thread_part_weights = Kokkos::View<double *, device_t>(
2732  Kokkos::ViewAllocateWithoutInitializing("thread_part_weights"),
2733  this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
2734 
2735  this->thread_cut_left_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2736  Kokkos::ViewAllocateWithoutInitializing("thread_cut_left_closest_point"),
2737  this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2738 
2739  // thread_cut_right_closest_point to hold the closest coordinate to a
2740  // cutline from right (for each thread)
2741  this->thread_cut_right_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2742  Kokkos::ViewAllocateWithoutInitializing("thread_cut_right_closest_point"),
2743  this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2744 
2745  // to store how many points in each part a thread has.
2746  this->thread_point_counts = Kokkos::View<mj_lno_t *, device_t>(
2747  Kokkos::ViewAllocateWithoutInitializing("thread_point_counts"),
2748  this->max_num_part_along_dim);
2749 
2750  // for faster communication, concatanation of
2751  // totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
2752  // leftClosest distances sized P-1, since P-1 cut lines
2753  // rightClosest distances size P-1, since P-1 cut lines.
2754  this->total_part_weight_left_right_closests =
2755  Kokkos::View<mj_scalar_t*, device_t>(
2756  Kokkos::ViewAllocateWithoutInitializing(
2757  "total_part_weight_left_right_closests"),
2758  (this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) *
2759  this->max_concurrent_part_calculation);
2760 
2761  this->global_total_part_weight_left_right_closests =
2762  Kokkos::View<mj_scalar_t*, device_t>(
2763  Kokkos::ViewAllocateWithoutInitializing(
2764  "global_total_part_weight_left_right_closests"),
2765  (this->max_num_total_part_along_dim +
2766  this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2767 
2768  this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
2769  Kokkos::ViewAllocateWithoutInitializing("gids"), num_local_coords);
2770 
2771  this->owner_of_coordinate = Kokkos::View<int *, Kokkos::HostSpace>(
2772  Kokkos::ViewAllocateWithoutInitializing("owner_of_coordinate"),
2773  num_local_coords);
2774 
2775  // changes owners back to host - so we don't run them on device
2776  // this improves migration code but means we have to serial init here.
2777  // Note we might allow this to be OpenMP when available even for CUDA.
2778  Kokkos::deep_copy(owner_of_coordinate, myActualRank);
2779 
2780  auto local_current_mj_gnos = current_mj_gnos;
2781  auto local_initial_mj_gnos = initial_mj_gnos;
2782  Kokkos::parallel_for(
2783  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2784  (0, num_local_coords), KOKKOS_LAMBDA (mj_lno_t j) {
2785  local_current_mj_gnos(j) = local_initial_mj_gnos(j);
2786  });
2787 }
2788 
2789 /* \brief compute the global bounding box
2790  */
2791 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2792  typename mj_part_t, typename mj_node_t>
2793 void AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,
2794  mj_node_t>::compute_global_box()
2795 {
2796  //local min coords
2797  mj_scalar_t *mins = new mj_scalar_t[this->coord_dim];
2798  //global min coords
2799  mj_scalar_t *gmins = new mj_scalar_t[this->coord_dim];
2800  //local max coords
2801  mj_scalar_t *maxs = new mj_scalar_t[this->coord_dim];
2802  //global max coords
2803  mj_scalar_t *gmaxs = new mj_scalar_t[this->coord_dim];
2804 
2805  auto local_mj_coordinates = this->mj_coordinates;
2806 
2807  // If we are only doing 2 parts then we don't need these values
2808  // for y and z. Init them all to 0 first
2809  for(int i = 0; i < this->coord_dim; ++i) {
2810  mins[i] = 0;
2811  maxs[i] = 0;
2812  }
2813 
2814  for(int i = 0; i < std::min(this->recursion_depth, this->coord_dim); ++i) {
2815  Kokkos::parallel_reduce("MinReduce",
2816  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2817  (0, this->num_local_coords),
2818  KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_min) {
2819  if(local_mj_coordinates(j,i) < running_min) {
2820  running_min = local_mj_coordinates(j,i);
2821  }
2822  }, Kokkos::Min<mj_scalar_t>(mins[i]));
2823  Kokkos::parallel_reduce("MaxReduce",
2824  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2825  (0, this->num_local_coords),
2826  KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_max) {
2827  if(local_mj_coordinates(j,i) > running_max) {
2828  running_max = local_mj_coordinates(j,i);
2829  }
2830  }, Kokkos::Max<mj_scalar_t>(maxs[i]));
2831  }
2832 
2833  reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
2834  this->coord_dim, mins, gmins
2835  );
2836 
2837  reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
2838  this->coord_dim, maxs, gmaxs
2839  );
2840 
2841  //create single box with all areas.
2842  global_box = rcp(new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
2843  //coordinateModelPartBox <mj_scalar_t, mj_part_t> tmpBox (0, coordDim);
2844  delete [] mins;
2845  delete [] gmins;
2846  delete [] maxs;
2847  delete [] gmaxs;
2848 }
2849 
2850 /* \brief for part communication we keep track of the box boundaries.
2851  * This is performed when either asked specifically, or when geometric mapping
2852  * is performed afterwards.
2853  * This function initializes a single box with all global min, max coordinates.
2854  * \param initial_partitioning_boxes the input and output vector for boxes.
2855  */
2856 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2857  typename mj_part_t, typename mj_node_t>
2858 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2859  mj_node_t>::init_part_boxes(
2860  RCP<mj_partBoxVector_t> & initial_partitioning_boxes)
2861 {
2862  mj_partBox_t tmp_box(*global_box);
2863  initial_partitioning_boxes->push_back(tmp_box);
2864 }
2865 
2870 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2871  typename mj_part_t,
2872  typename mj_node_t>
2873 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2874  mj_get_local_min_max_coord_totW(
2875  mj_part_t current_work_part,
2876  mj_part_t current_concurrent_num_parts,
2877  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords)
2878 {
2879  auto local_coordinate_permutations = this->coordinate_permutations;
2880  auto local_process_local_min_max_coord_total_weight =
2881  this->process_local_min_max_coord_total_weight;
2882  auto local_mj_weights = this->mj_weights;
2883 
2884  bool bUniformWeights = mj_uniform_weights(0);
2885 
2886  for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
2887 
2888  mj_part_t concurrent_current_part = current_work_part + kk;
2889  mj_lno_t coordinate_begin_index = concurrent_current_part == 0 ? 0 :
2890  host_part_xadj(concurrent_current_part - 1);
2891  mj_lno_t coordinate_end_index =
2892  host_part_xadj(concurrent_current_part);
2893 
2894  mj_scalar_t my_min_coord = 0;
2895  mj_scalar_t my_max_coord = 0;
2896  mj_scalar_t my_total_weight;
2897  //if the part is empty.
2898  //set the min and max coordinates as reverse.
2899  if(coordinate_begin_index >= coordinate_end_index)
2900  {
2901  my_min_coord = std::numeric_limits<mj_scalar_t>::max();
2902  my_max_coord = -std::numeric_limits<mj_scalar_t>::max();
2903  my_total_weight = 0;
2904  }
2905  else {
2906  // get min
2907  Kokkos::parallel_reduce("get min",
2908  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2909  (coordinate_begin_index, coordinate_end_index),
2910  KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_min) {
2911  int i = local_coordinate_permutations(j);
2912  if(mj_current_dim_coords(i) < running_min)
2913  running_min = mj_current_dim_coords(i);
2914  }, Kokkos::Min<mj_scalar_t>(my_min_coord));
2915  // get max
2916  Kokkos::parallel_reduce("get max",
2917  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2918  (coordinate_begin_index, coordinate_end_index),
2919  KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_max) {
2920  int i = local_coordinate_permutations(j);
2921  if(mj_current_dim_coords(i) > running_max)
2922  running_max = mj_current_dim_coords(i);
2923  }, Kokkos::Max<mj_scalar_t>(my_max_coord));
2924  if(bUniformWeights) {
2925  my_total_weight = coordinate_end_index - coordinate_begin_index;
2926  }
2927  else {
2928  my_total_weight = 0;
2929  Kokkos::parallel_reduce("get weight",
2930  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2931  (coordinate_begin_index, coordinate_end_index),
2932  KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & lsum) {
2933  int i = local_coordinate_permutations(j);
2934  lsum += local_mj_weights(i,0);
2935  }, my_total_weight);
2936  }
2937  }
2938 
2939  // single write
2940  Kokkos::parallel_for(
2941  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
2942  (0, 1), KOKKOS_LAMBDA (int dummy) {
2943  local_process_local_min_max_coord_total_weight(kk) =
2944  my_min_coord;
2945  local_process_local_min_max_coord_total_weight(
2946  kk + current_concurrent_num_parts) = my_max_coord;
2947  local_process_local_min_max_coord_total_weight(
2948  kk + 2*current_concurrent_num_parts) = my_total_weight;
2949  });
2950  }
2951 }
2952 
// Combines the per-process {min coordinate, max coordinate, total weight}
// entries of the concurrently processed parts into global values.
//
//   current_concurrent_num_parts  number of parts worked on together; 3 times
//                                 this many entries are reduced/copied.
//   local_min_max_total           this process's values (input).
//   global_min_max_total          receives the combined values (output).
//
// With a single process the local entries are copied to the global view on
// the device; otherwise the data is staged through host memory and combined
// with reduceAll.
2965 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2966  typename mj_part_t, typename mj_node_t>
2967 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2968  mj_node_t>::mj_get_global_min_max_coord_totW(
2969  mj_part_t current_concurrent_num_parts,
2970  Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
2971  Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total) {
2972  // reduce min for first current_concurrent_num_parts elements, reduce
2973  // max for next current_concurrent_num_parts elements, reduce sum for the
2974  // last current_concurrent_num_parts elements.
2975  if(this->comm->getSize() > 1) {
2976  // We're using explicit host here as Spectrum MPI would fail
2977  // with the prior HostMirror UVMSpace to UVMSpace setup.
2978  auto host_local_min_max_total =
2979  Kokkos::create_mirror_view(Kokkos::HostSpace(), local_min_max_total);
2980  auto host_global_min_max_total =
2981  Kokkos::create_mirror_view(Kokkos::HostSpace(), global_min_max_total);
// Only the local values need to be brought to the host; the global mirror
// is filled by the reduction below.
2982  Kokkos::deep_copy(host_local_min_max_total, local_min_max_total);
// NOTE(review): the line naming the type of reductionOp (listing line 2983)
// is absent from this listing. The next two lines are its constructor
// arguments: one count each for the min, max and sum segments.
2984  reductionOp(current_concurrent_num_parts,
2985  current_concurrent_num_parts, current_concurrent_num_parts);
2986  try {
2987  reduceAll<int, mj_scalar_t>(
2988  *(this->comm),
2989  reductionOp,
2990  3 * current_concurrent_num_parts,
2991  host_local_min_max_total.data(),
2992  host_global_min_max_total.data());
2993  }
// Presumably this macro supplies the catch clauses for the try block above;
// confirm against its definition.
2994  Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
// Push the reduced values back to the caller's device view.
2995  Kokkos::deep_copy(global_min_max_total, host_global_min_max_total);
2996  }
2997  else {
// Single process: the local values already are the global values.
2998  mj_part_t s = 3 * current_concurrent_num_parts;
2999  Kokkos::parallel_for(
3000  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3001  (0, s), KOKKOS_LAMBDA (mj_part_t i) {
3002  global_min_max_total(i) = local_min_max_total(i);
3003  });
3004  }
3005 }
3006 
3039 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3040  typename mj_part_t, typename mj_node_t>
3041 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
3042  mj_get_initial_cut_coords_target_weights(
3043  mj_scalar_t min_coord,
3044  mj_scalar_t max_coord,
3045  mj_part_t num_cuts/*p-1*/ ,
3046  mj_scalar_t global_weight,
3047  /*p - 1 sized, coordinate of each cut line*/
3048  Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
3049  /*cumulative weights, at left side of each cut line. p-1 sized*/
3050  Kokkos::View<mj_scalar_t *, device_t> & current_target_part_weights ,
3051  std::vector <mj_part_t> *future_num_part_in_parts, //the vecto
3052  std::vector <mj_part_t> *next_future_num_parts_in_parts,
3053  mj_part_t concurrent_current_part,
3054  mj_part_t obtained_part_index,
3055  mj_part_t num_target_first_level_parts,
3056  const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist)
3057 {
3058  mj_scalar_t coord_range = max_coord - min_coord;
3059 
3060  // We decided we could keep some std::vectors around for now. Eventually
3061  // it would be nice to have everything just as views with some being device
3062  // and some host. This particular case needs a bit of work to get setup
3063  // in a cleaner way so not going to mess with it at the moment.
3064 
3065  bool bUniformPartsCheck =
3066  num_target_first_level_parts <= 1 && this->mj_uniform_parts(0);
3067 
3068  if(!bUniformPartsCheck) {
3069  bool bValidNonUniformTargetWeights =
3070  (num_target_first_level_parts > 1 && target_first_level_dist.size() != 0);
3071  if(!bValidNonUniformTargetWeights) {
3072  std::cerr << "MJ does not support non uniform part weights beyond the first partition" << std::endl;
3073  std::terminate();
3074  }
3075  }
3076 
3077  Kokkos::View<mj_scalar_t*, device_t> device_cumulative(
3078  "device_cumulative", num_cuts);
3079  auto host_cumulative = Kokkos::create_mirror_view(device_cumulative);
3080 
3081  mj_scalar_t cumulative = 0;
3082 
3083  if(bUniformPartsCheck) {
3084  // How many total future parts the part will be partitioned into.
3085  mj_scalar_t total_future_part_count_in_part =
3086  static_cast<mj_scalar_t>((*future_num_part_in_parts)[concurrent_current_part]);
3087 
3088  // How much each part should weigh in ideal case.
3089  mj_scalar_t unit_part_weight =
3090  global_weight / total_future_part_count_in_part;
3091 
3092  for(mj_part_t i = 0; i < num_cuts; ++i) {
3093  cumulative += unit_part_weight * static_cast<mj_scalar_t>((*next_future_num_parts_in_parts)[i + obtained_part_index]);
3094  host_cumulative(i) = cumulative;
3095  }
3096  }
3097  else {
3098  // Sum of entries in the first level partition distribution vector
3099  mj_scalar_t sum_target_first_level_dist = 0.0;
3100  for (int i = 0; i < num_target_first_level_parts; ++i) {
3101  sum_target_first_level_dist += target_first_level_dist(i);
3102  }
3103 
3104  for(mj_part_t i = 0; i < num_cuts; ++i) {
3105  cumulative += global_weight * target_first_level_dist(i) /
3106  sum_target_first_level_dist;
3107  host_cumulative(i) = cumulative;
3108  }
3109  }
3110 
3111  Kokkos::deep_copy(device_cumulative, host_cumulative);
3112 
3113  Kokkos::parallel_for("Write num in parts",
3114  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3115  (0, num_cuts), KOKKOS_LAMBDA(mj_part_t cut) {
3116  // set target part weight.
3117  current_target_part_weights(cut) = device_cumulative(cut);
3118  initial_cut_coords(cut) = min_coord +
3119  (coord_range * device_cumulative(cut)) / global_weight;
3120  // set this multiple times but here for device handling
3121  current_target_part_weights(num_cuts) = global_weight;
3122  });
3123 
3124  // round the target part weights.
3125  // Note need to discuss regarding DragonFly commits and determine if we
3126  // would not simply check mj_uniform_weights here.
3127  if (!bUniformPartsCheck || this->mj_uniform_weights[0]) {
3128  Kokkos::parallel_for(
3129  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3130  (0, num_cuts + 1),
3131  KOKKOS_LAMBDA (mj_part_t i) {
3132  current_target_part_weights(i) =
3133  long(current_target_part_weights(i) + 0.5);
3134  });
3135  }
3136 }
3137 
3154 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3155  typename mj_part_t, typename mj_node_t>
3156 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
3157  set_initial_coordinate_parts(
3158  mj_scalar_t &max_coordinate,
3159  mj_scalar_t &min_coordinate,
3160  mj_lno_t coordinate_begin_index,
3161  mj_lno_t coordinate_end_index,
3162  Kokkos::View<mj_lno_t *, device_t> & mj_current_coordinate_permutations,
3163  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3164  Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
3165  mj_part_t &partition_count)
3166 {
3167  mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
3168 
3169  // if there is single point, or if all points are along a line.
3170  // set initial part to 0 for all.
3171  if(std::abs(coordinate_range) < this->sEpsilon ) {
3172  Kokkos::parallel_for(
3173  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3174  (coordinate_begin_index, coordinate_end_index),
3175  KOKKOS_LAMBDA (mj_lno_t ii) {
3176  mj_part_ids(mj_current_coordinate_permutations[ii]) = 0;
3177  });
3178  }
3179  else {
3180  // otherwise estimate an initial part for each coordinate.
3181  // assuming uniform distribution of points.
3182  mj_scalar_t slice = coordinate_range / partition_count;
3183  Kokkos::parallel_for(
3184  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3185  (coordinate_begin_index, coordinate_end_index),
3186  KOKKOS_LAMBDA (mj_lno_t ii) {
3187  mj_lno_t iii = mj_current_coordinate_permutations[ii];
3188  mj_part_t pp =
3189  mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
3190  if(pp >= partition_count) {
3191  pp = partition_count - 1; // don't want last coord in an invalid part
3192  }
3193  mj_part_ids[iii] = 2 * pp;
3194  });
3195  }
3196 }
3197 
// Runs the iterative 1D partitioning for the parts
// [current_work_part, current_work_part + current_concurrent_num_parts)
// along the dimension whose coordinates are in mj_current_dim_coords.
//
// Each pass of the while loop computes the part weights for the current cut
// positions, combines them across processes when there is more than one
// process, and asks mj_get_new_cut_coordinates for updated cut positions.
// The loop ends when total_incomplete_cut_count reaches zero.
//
//   mj_current_dim_coords         coordinates along the dimension being cut.
//   used_imbalance_tolerance      forwarded to mj_get_new_cut_coordinates.
//   current_work_part             index of the first part being processed.
//   current_concurrent_num_parts  number of parts processed together.
//   current_cut_coordinates       cut positions; installed as temp_cut_coords
//                                 on entry.
//   total_incomplete_cut_count    number of cuts still to be settled; passed
//                                 by value and decremented locally.
//   view_rectilinear_cut_count    element 0 is reset to 0 here and the view
//                                 is passed to mj_get_new_cut_coordinates.
//   view_total_reduction_size     element 0 is set here to the sum of
//                                 (4 * cuts + 1) over the concurrent parts.
3212 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3213  typename mj_part_t, typename mj_node_t>
3214 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,mj_node_t>::mj_1D_part(
3215  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3216  double used_imbalance_tolerance,
3217  mj_part_t current_work_part,
3218  mj_part_t current_concurrent_num_parts,
3219  Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
3220  mj_part_t total_incomplete_cut_count,
3221  Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
3222  Kokkos::View<size_t*, device_t> & view_total_reduction_size)
3223 {
3224  this->temp_cut_coords = current_cut_coordinates;
3225 
// NOTE(review): the line naming the pointee type of reductionOp (listing
// line 3226) is absent from this listing. The pointer starts out null and is
// only allocated when there is more than one process.
3227  *reductionOp = NULL;
3228 
3229  bool bSingleProcess = (this->comm->getSize() == 1);
3230 
// Host copy of the per-part partition counts. The reduction op is handed a
// pointer to this vector, so it is declared at function scope.
3231  std::vector<mj_part_t> temp(host_num_partitioning_in_current_dim.size());
3232  if(!bSingleProcess) {
3233  for(size_t n = 0; n < host_num_partitioning_in_current_dim.size(); ++n) {
3234  temp[n] = host_num_partitioning_in_current_dim(n);
3235  }
3236  reductionOp = new Teuchos::MultiJaggedCombinedReductionOp
3237  <mj_part_t, mj_scalar_t>(
3238  &temp,
3239  current_work_part,
3240  current_concurrent_num_parts);
3241  }
3242 
// Local handles to member views and flags. Presumably these exist so the
// device lambdas below capture the handles by value instead of `this`.
// NOTE(review): local_part_xadj, local_global_rectilinear_cut_weight and
// local_process_rectilinear_cut_weight are not referenced again in this
// function.
3243  auto local_cut_lower_bound_coordinates =
3244  cut_lower_bound_coordinates;
3245  auto local_cut_upper_bound_coordinates =
3246  cut_upper_bound_coordinates;
3247  auto local_cut_upper_bound_weights = cut_upper_bound_weights;
3248  auto local_cut_lower_bound_weights = cut_lower_bound_weights;
3249  bool local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
3250  auto local_process_cut_line_weight_to_put_left =
3251  process_cut_line_weight_to_put_left;
3252  auto local_temp_cut_coords = temp_cut_coords;
3253  auto local_global_total_part_weight_left_right_closests =
3254  global_total_part_weight_left_right_closests;
3255  auto local_cut_coordinates_work_array =
3256  cut_coordinates_work_array;
3257  auto local_part_xadj = part_xadj;
3258  auto local_global_min_max_coord_total_weight =
3259  global_min_max_coord_total_weight;
3260  auto local_target_part_weights =
3261  target_part_weights;
3262  auto local_global_rectilinear_cut_weight =
3263  global_rectilinear_cut_weight;
3264  auto local_process_rectilinear_cut_weight =
3265  process_rectilinear_cut_weight;
3266 
3267  auto local_is_cut_line_determined = this->is_cut_line_determined;
3268  auto local_device_num_partitioning_in_current_dim =
3269  device_num_partitioning_in_current_dim;
3270 
// Single-iteration kernel: the initialization below runs as one serial loop
// in the execution space.
3271  Kokkos::parallel_for(
3272  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3273  KOKKOS_LAMBDA (int dummy) {
3274 
3275  // these need to be initialized
3276  view_rectilinear_cut_count(0) = 0;
3277  view_total_reduction_size(0) = 0;
3278 
3279  // initialize the lower and upper bounds of the cuts.
// `next` is the running cut index across all concurrent parts.
3280  mj_part_t next = 0;
3281  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3282  mj_part_t num_part_in_dim =
3283  local_device_num_partitioning_in_current_dim(current_work_part + i);
3284  mj_part_t num_cut_in_dim = num_part_in_dim - 1;
// 4 * cuts + 1 matches the per-part advance of tlr_shift in the while loop
// below: (parts + cuts) weight entries plus 2 * cuts closest-point entries.
3285  view_total_reduction_size(0) += (4 * num_cut_in_dim + 1);
3286 
3287  for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii) {
3288  local_is_cut_line_determined(next) = false;
3289  // min coordinate
3290  local_cut_lower_bound_coordinates(next) =
3291  local_global_min_max_coord_total_weight(i);
3292  // max coordinate
3293  local_cut_upper_bound_coordinates(next) =
3294  local_global_min_max_coord_total_weight(
3295  i + current_concurrent_num_parts);
3296  // total weight
3297  local_cut_upper_bound_weights(next) =
3298  local_global_min_max_coord_total_weight(
3299  i + 2 * current_concurrent_num_parts);
3300  local_cut_lower_bound_weights(next) = 0;
3301  if(local_distribute_points_on_cut_lines) {
3302  local_process_cut_line_weight_to_put_left(next) = 0;
3303  }
3304  ++next;
3305  }
3306  }
3307  });
3308 
3309  // loop_count allows the kernel to behave differently on the first loop
3310  // and subsequent loops. First loop we do a binary search and subsequent
3311  // loops we simply step towards our target.
3312  int loop_count = 0;
3313  while (total_incomplete_cut_count != 0) {
3314  this->mj_1D_part_get_part_weights(
3315  current_concurrent_num_parts,
3316  current_work_part,
3317  mj_current_dim_coords,
3318  loop_count);
3319  ++loop_count;
3320 
3321  this->mj_combine_rightleft_and_weights(
3322  current_work_part,
3323  current_concurrent_num_parts);
3324 
3325  // now sum up the results of mpi processors.
3326  if(!bSingleProcess) {
3327  // We're using explicit host here as Spectrum MPI would fail
3328  // with the prior HostMirror UVMSpace to UVMSpace setup.
3329  auto host_total_part_weight_left_right_closests =
3330  Kokkos::create_mirror_view(Kokkos::HostSpace(),
3331  total_part_weight_left_right_closests);
3332  auto host_global_total_part_weight_left_right_closests =
3333  Kokkos::create_mirror_view(Kokkos::HostSpace(),
3334  global_total_part_weight_left_right_closests);
3335 
3336  Kokkos::deep_copy(host_total_part_weight_left_right_closests,
3337  total_part_weight_left_right_closests);
3338 
// Fetch the one-element device value into a host scalar by way of a
// single-iteration reduction.
3339  size_t host_view_total_reduction_size;
3340  Kokkos::parallel_reduce("Read single",
3341  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3342  KOKKOS_LAMBDA(int dummy, size_t & set_single) {
3343  set_single = view_total_reduction_size(0);
3344  }, host_view_total_reduction_size);
3345 
3346  reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
3347  host_view_total_reduction_size,
3348  host_total_part_weight_left_right_closests.data(),
3349  host_global_total_part_weight_left_right_closests.data());
3350  Kokkos::deep_copy(global_total_part_weight_left_right_closests,
3351  host_global_total_part_weight_left_right_closests);
3352  }
3353  else {
// Single process: only the local handle is rebound to the process-local
// data; the member global_total_part_weight_left_right_closests is left
// untouched.
3354  local_global_total_part_weight_left_right_closests =
3355  this->total_part_weight_left_right_closests;
3356  }
3357 
3358  // how much cut will be shifted for the next part in the concurrent
3359  // part calculation.
3360  mj_part_t cut_shift = 0;
3361 
3362  // how much the concatenated array will be shifted for the next part
3363  // in concurrent part calculation.
3364  size_t tlr_shift = 0;
3365 
// Per-part incomplete-cut count at the start of this pass, used after the
// kk loop to work out how many cuts were completed. Entries for parts that
// take the `continue` path below are never assigned.
// NOTE(review): this relies on the view's initial contents being 0 - confirm.
3366  Kokkos::View<mj_part_t*, Kokkos::HostSpace>
3367  save_initial_incomplete_cut_count("save_initial_incomplete_cut_count",
3368  current_concurrent_num_parts);
3369 
3370  for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3371 
3372  mj_part_t num_parts =
3373  host_num_partitioning_in_current_dim(current_work_part + kk);
3374 
3375  mj_part_t num_cuts = num_parts - 1;
3376  size_t num_total_part = num_parts + size_t (num_cuts);
3377 
3378  //if the cuts of this part have already been completed.
3379  //nothing to do for this part.
3380  //just update the shift amount and proceed.
3381  mj_part_t kk_incomplete_cut_count = this->incomplete_cut_count(kk);
3382 
3383  if(kk_incomplete_cut_count == 0) {
3384  cut_shift += num_cuts;
3385  tlr_shift += (num_total_part + 2 * num_cuts);
3386  continue;
3387  }
3388 
// All subviews below start at this part's offset and run to the end of the
// parent view; none of them is trimmed to this part's length.
3389  Kokkos::View<mj_scalar_t *, device_t> current_local_part_weights =
3390  Kokkos::subview(this->total_part_weight_left_right_closests,
3391  std::pair<mj_lno_t, mj_lno_t>(
3392  tlr_shift,
3393  this->total_part_weight_left_right_closests.size()));
3394 
3395  Kokkos::View<mj_scalar_t *, device_t> current_global_tlr =
3396  Kokkos::subview(
3397  local_global_total_part_weight_left_right_closests,
3398  std::pair<mj_lno_t, mj_lno_t>(
3399  tlr_shift,
3400  local_global_total_part_weight_left_right_closests.size()));
// Within this part's segment the left-closest entries start after the
// num_total_part weight entries, and the right-closest entries start
// num_cuts further on.
3401  Kokkos::View<mj_scalar_t *, device_t>
3402  current_global_left_closest_points =
3403  Kokkos::subview(current_global_tlr,
3404  std::pair<mj_lno_t, mj_lno_t>(
3405  num_total_part,
3406  current_global_tlr.size()));
3407  Kokkos::View<mj_scalar_t *, device_t>
3408  current_global_right_closest_points =
3409  Kokkos::subview(current_global_tlr,
3410  std::pair<mj_lno_t, mj_lno_t>(
3411  num_total_part + num_cuts,
3412  current_global_tlr.size()));
3413  Kokkos::View<mj_scalar_t *, device_t> current_global_part_weights =
3414  current_global_tlr;
3415 
3416  Kokkos::View<bool *, device_t> current_cut_line_determined =
3417  Kokkos::subview(this->is_cut_line_determined,
3418  std::pair<mj_lno_t, mj_lno_t>(
3419  cut_shift,
3420  this->is_cut_line_determined.size()));
// The offset is cut_shift + kk rather than cut_shift, which is consistent
// with each part holding one more target weight than it has cuts.
3421  Kokkos::View<mj_scalar_t *, device_t> current_part_target_weights =
3422  Kokkos::subview(local_target_part_weights,
3423  std::pair<mj_lno_t, mj_lno_t>(
3424  cut_shift + kk,
3425  local_target_part_weights.size()));
3426  Kokkos::View<mj_scalar_t *, device_t>
3427  current_part_cut_line_weight_to_put_left =
3428  Kokkos::subview(local_process_cut_line_weight_to_put_left,
3429  std::pair<mj_lno_t, mj_lno_t>(
3430  cut_shift,
3431  local_process_cut_line_weight_to_put_left.size()));
3432 
3433  save_initial_incomplete_cut_count(kk) =
3434  kk_incomplete_cut_count;
3435 
3436  Kokkos::View<mj_scalar_t *, device_t>
3437  current_cut_lower_bound_weights =
3438  Kokkos::subview(local_cut_lower_bound_weights,
3439  std::pair<mj_lno_t, mj_lno_t>(
3440  cut_shift,
3441  local_cut_lower_bound_weights.size()));
3442  Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_weights =
3443  Kokkos::subview(local_cut_upper_bound_weights,
3444  std::pair<mj_lno_t, mj_lno_t>(
3445  cut_shift,
3446  local_cut_upper_bound_weights.size()));
3447  Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_bounds =
3448  Kokkos::subview(local_cut_upper_bound_coordinates,
3449  std::pair<mj_lno_t, mj_lno_t>(
3450  cut_shift,
3451  local_cut_upper_bound_coordinates.size()));
3452  Kokkos::View<mj_scalar_t *, device_t> current_cut_lower_bounds =
3453  Kokkos::subview(local_cut_lower_bound_coordinates,
3454  std::pair<mj_lno_t, mj_lno_t>(
3455  cut_shift,
3456  local_cut_lower_bound_coordinates.size()));
3457 
3458  // Now compute the new cut coordinates.
3459  Kokkos::View<mj_scalar_t*, device_t> sub_temp_cut_coords =
3460  Kokkos::subview(this->temp_cut_coords,
3461  std::pair<mj_lno_t, mj_lno_t>(
3462  cut_shift, this->temp_cut_coords.size()));
3463  Kokkos::View<mj_scalar_t*, device_t> sub_cut_coordinates_work_array =
3464  Kokkos::subview(this->cut_coordinates_work_array,
3465  std::pair<mj_lno_t, mj_lno_t>(
3466  cut_shift, this->cut_coordinates_work_array.size()));
3467 
3468  this->mj_get_new_cut_coordinates(
3469  current_concurrent_num_parts,
3470  kk,
3471  num_cuts,
3472  used_imbalance_tolerance,
3473  current_global_part_weights,
3474  current_local_part_weights,
3475  current_part_target_weights,
3476  current_cut_line_determined,
3477  sub_temp_cut_coords,
3478  current_cut_upper_bounds,
3479  current_cut_lower_bounds,
3480  current_global_left_closest_points,
3481  current_global_right_closest_points,
3482  current_cut_lower_bound_weights,
3483  current_cut_upper_weights,
3484  sub_cut_coordinates_work_array,
3485  current_part_cut_line_weight_to_put_left,
3486  view_rectilinear_cut_count);
3487 
3488  cut_shift += num_cuts;
3489  tlr_shift += (num_total_part + 2 * num_cuts);
3490  } // end of kk loop
3491 
// Subtract the cuts that became complete during this pass. This reads
// incomplete_cut_count again, so it presumably was updated by
// mj_get_new_cut_coordinates - confirm.
3492  for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3493  mj_part_t iteration_complete_cut_count =
3494  save_initial_incomplete_cut_count(kk) - this->incomplete_cut_count(kk);
3495  total_incomplete_cut_count -= iteration_complete_cut_count;
3496  }
3497 
// Exchange the contents of the cut coordinate array and the work array
// element by element; the view handles themselves are not swapped.
3498  Kokkos::parallel_for(
3499  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3500  (0, local_temp_cut_coords.size()), KOKKOS_LAMBDA(int n) {
3501  auto t = local_temp_cut_coords(n);
3502  local_temp_cut_coords(n) = local_cut_coordinates_work_array(n);
3503  local_cut_coordinates_work_array(n) = t;
3504  });
3505  } // end of the while loop
3506 
3507  // Needed only if keep_cuts; otherwise can simply swap array pointers
3508  // cutCoordinates and cutCoordinatesWork.
3509  // (at first iteration, cutCoordinates == cutCoordinates_tmp).
3510  // computed cuts must be in cutCoordinates.
// NOTE(review): local_temp_cut_coords was copied from temp_cut_coords, which
// was set to current_cut_coordinates on entry, and neither handle is
// reassigned in this function, so this condition looks like it can never be
// true - confirm.
3511  if(current_cut_coordinates != local_temp_cut_coords) {
3512  Kokkos::parallel_for(
3513  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3514  (0, 1), KOKKOS_LAMBDA(int dummy) {
3515  mj_part_t next = 0;
3516  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3517  mj_part_t num_parts = -1;
3518  num_parts = local_device_num_partitioning_in_current_dim(
3519  current_work_part + i);
3520  mj_part_t num_cuts = num_parts - 1;
3521  for(mj_part_t ii = 0; ii < num_cuts; ++ii) {
3522  current_cut_coordinates(next + ii) = local_temp_cut_coords(next + ii);
3523  }
3524  next += num_cuts;
3525  }
3526  for(int n = 0; n <
3527  static_cast<int>(local_cut_coordinates_work_array.size()); ++n) {
3528  local_cut_coordinates_work_array(n) = local_temp_cut_coords(n);
3529  }
3530  });
3531  }
3532 
// reductionOp is still null in the single-process case; deleting a null
// pointer is a no-op.
// NOTE(review): nothing visible here releases it if an exception is thrown
// above this point.
3533  delete reductionOp;
3534 }
3535 
// Thin wrapper around a raw scalar_t pointer, used as the value type of the
// array reductions further down in this file. Nothing in this struct
// allocates or frees the memory that ptr refers to.
// NOTE(review): the struct header line (listing line 3537) is absent from
// this listing.
3536 template<class scalar_t>
3538  scalar_t * ptr;
3539 
3540  // With new kokkos setup parallel_reduce will call empty constructor and
3541  // we update the ptr in the init method.
3542  KOKKOS_INLINE_FUNCTION
3543  Zoltan2_MJArrayType() : ptr(NULL) {};
3544 
// Wraps an existing array without copying it.
3545  KOKKOS_INLINE_FUNCTION
3546  Zoltan2_MJArrayType(scalar_t * pSetPtr) : ptr(pSetPtr) {};
3547 };
3548 
3549 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3550 
// Reducer for array-valued reductions. The array has value_count_weights
// entries that are summed, followed by value_count_rightleft entries laid
// out as pairs, where the first entry of a pair is combined with max and the
// second with min.
// NOTE(review): several lines of this struct are absent from this listing:
// the struct header, its typedefs, the declarations of value,
// value_count_rightleft and value_count_weights, and the signature of the
// accessor that returns *value.
3551 template<class policy_t, class scalar_t, class part_t>
3553 
3556  scalar_t max_scalar;
3560 
// Stores the address of val; val must therefore outlive this reducer.
3561  KOKKOS_INLINE_FUNCTION ArrayCombinationReducer(
3562  scalar_t mj_max_scalar,
3563  value_type &val,
3564  int mj_value_count_rightleft,
3565  int mj_value_count_weights) :
3566  max_scalar(mj_max_scalar),
3567  value(&val),
3568  value_count_rightleft(mj_value_count_rightleft),
3569  value_count_weights(mj_value_count_weights)
3570  {}
3571 
3572  KOKKOS_INLINE_FUNCTION
3574  return *value;
3575  }
3576 
// Folds src into dst: weights are added; in the right/left segment the even
// slot keeps the larger value and the odd slot the smaller one. The first
// and the last pair of the segment are not combined; the functor below
// describes that segment as padded with an extra pair at each end.
3577  KOKKOS_INLINE_FUNCTION
3578  void join(value_type& dst, const value_type& src) const {
3579  for(int n = 0; n < value_count_weights; ++n) {
3580  dst.ptr[n] += src.ptr[n];
3581  }
3582 
3583  for(int n = value_count_weights + 2;
3584  n < value_count_weights + value_count_rightleft - 2; n += 2) {
3585  if(src.ptr[n] > dst.ptr[n]) {
3586  dst.ptr[n] = src.ptr[n];
3587  }
3588  if(src.ptr[n+1] < dst.ptr[n+1]) {
3589  dst.ptr[n+1] = src.ptr[n+1];
3590  }
3591  }
3592  }
3593 
// Volatile overload with the same body as the join above.
3594  KOKKOS_INLINE_FUNCTION
3595  void join (volatile value_type& dst, const volatile value_type& src) const {
3596  for(int n = 0; n < value_count_weights; ++n) {
3597  dst.ptr[n] += src.ptr[n];
3598  }
3599 
3600  for(int n = value_count_weights + 2;
3601  n < value_count_weights + value_count_rightleft - 2; n += 2) {
3602  if(src.ptr[n] > dst.ptr[n]) {
3603  dst.ptr[n] = src.ptr[n];
3604  }
3605  if(src.ptr[n+1] < dst.ptr[n+1]) {
3606  dst.ptr[n+1] = src.ptr[n+1];
3607  }
3608  }
3609  }
3610 
// Points dst at the array this reducer was constructed with, then resets
// it: weights to 0, and every right/left pair (including the first and last
// pair) to {-max_scalar, +max_scalar}.
3611  KOKKOS_INLINE_FUNCTION void init (value_type& dst) const {
3612  dst.ptr = value->ptr; // must update ptr
3613 
3614  for(int n = 0; n < value_count_weights; ++n) {
3615  dst.ptr[n] = 0;
3616  }
3617 
3618  for(int n = value_count_weights;
3619  n < value_count_weights + value_count_rightleft; n += 2) {
3620  dst.ptr[n] = -max_scalar;
3621  dst.ptr[n+1] = max_scalar;
3622  }
3623  }
3624 };
3625 #endif // !KOKKOS_ENABLE_CUDA && !KOKKOS_ENABLE_HIP
3626 
3627 template<class policy_t, class scalar_t, class part_t, class index_t,
3628  class device_t, class array_t>
3630  typedef typename policy_t::member_type member_type;
3631  typedef Kokkos::View<scalar_t*> scalar_view_t;
3632 
3633 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3634  typedef array_t value_type[];
3635 #endif
3636 
3638  array_t max_scalar;
3639 
3641  part_t num_cuts;
3647  Kokkos::View<index_t*, device_t> permutations;
3648  Kokkos::View<scalar_t *, device_t> coordinates;
3649  Kokkos::View<scalar_t**, device_t> weights;
3650  Kokkos::View<part_t*, device_t> parts;
3651  Kokkos::View<scalar_t *, device_t> cut_coordinates;
3652  Kokkos::View<index_t *, device_t> part_xadj;
3654  scalar_t sEpsilon;
3655 
3656 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3657  Kokkos::View<double *, device_t> current_part_weights;
3658  Kokkos::View<scalar_t *, device_t> current_left_closest;
3659  Kokkos::View<scalar_t *, device_t> current_right_closest;
3660 #endif // KOKKOS_ENABLE_CUDA || defined(KOKKOS_ENABLE_HIP)
3661 
3663  int mj_loop_count,
3664  array_t mj_max_scalar,
3665  part_t mj_concurrent_current_part,
3666  part_t mj_num_cuts,
3667  part_t mj_current_work_part,
3668  part_t mj_current_concurrent_num_parts,
3669  part_t mj_left_right_array_size,
3670  part_t mj_weight_array_size,
3671  Kokkos::View<index_t*, device_t> & mj_permutations,
3672  Kokkos::View<scalar_t *, device_t> & mj_coordinates,
3673  Kokkos::View<scalar_t**, device_t> & mj_weights,
3674  Kokkos::View<part_t*, device_t> & mj_parts,
3675  Kokkos::View<scalar_t *, device_t> & mj_cut_coordinates,
3676  Kokkos::View<index_t *, device_t> & mj_part_xadj,
3677  bool mj_uniform_weights0,
3678  scalar_t mj_sEpsilon
3679 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3680  ,Kokkos::View<double *, device_t> & mj_current_part_weights,
3681  Kokkos::View<scalar_t *, device_t> & mj_current_left_closest,
3682  Kokkos::View<scalar_t *, device_t> & mj_current_right_closest
3683 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3684  ) :
3685  loop_count(mj_loop_count),
3686  max_scalar(mj_max_scalar),
3687  concurrent_current_part(mj_concurrent_current_part),
3688  num_cuts(mj_num_cuts),
3689  current_work_part(mj_current_work_part),
3690  current_concurrent_num_parts(mj_current_concurrent_num_parts),
3691  value_count_rightleft(mj_left_right_array_size),
3692  value_count_weights(mj_weight_array_size),
3693  value_count(mj_weight_array_size+mj_left_right_array_size),
3694  permutations(mj_permutations),
3695  coordinates(mj_coordinates),
3696  weights(mj_weights),
3697  parts(mj_parts),
3698  cut_coordinates(mj_cut_coordinates),
3699  part_xadj(mj_part_xadj),
3700  uniform_weights0(mj_uniform_weights0),
3701  sEpsilon(mj_sEpsilon)
3702 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3703  ,current_part_weights(mj_current_part_weights),
3704  current_left_closest(mj_current_left_closest),
3705  current_right_closest(mj_current_right_closest)
3706 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3707  {
3708  }
3709 
3710  size_t team_shmem_size (int team_size) const {
3711 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3712  int result = sizeof(array_t) *
3714 #else
3715  int result = sizeof(array_t) *
3717 #endif
3718 
3719  // pad this to a multiple of 8 or it will run corrupt
3720  int remainder = result % 8;
3721  if(remainder != 0) {
3722  result += 8 - remainder;
3723  }
3724  return result;
3725  }
3726 
3727  KOKKOS_INLINE_FUNCTION
3728 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3729  void operator() (const member_type & teamMember) const {
3730 #else
3731  void operator() (const member_type & teamMember, value_type teamSum) const {
3732 #endif
3733 
3734  index_t all_begin = (concurrent_current_part == 0) ? 0 :
3735  part_xadj(concurrent_current_part - 1);
3736  index_t all_end = part_xadj(concurrent_current_part);
3737 
3738  index_t num_working_points = all_end - all_begin;
3739  int num_teams = teamMember.league_size();
3740 
3741  index_t stride = num_working_points / num_teams;
3742  if((num_working_points % num_teams) > 0) {
3743  stride += 1; // make sure we have coverage for the final points
3744  }
3745 
3746  // the last team may have less work than the other teams
3747  // the last team can be empty (begin > end) if num_teams > stride
3748  // which is true for many teams and small numbers of coords (tests)
3749  index_t begin = all_begin + stride * teamMember.league_rank();
3750  index_t end = begin + stride;
3751  if(end > all_end) {
3752  end = all_end;
3753  }
3754 
3755 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3756  size_t sh_mem_size = sizeof(array_t) * (value_count_weights +
3758 
3759  array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3760  sh_mem_size);
3761 
3762  // init the shared array to 0
3763  Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
3764  for(int n = 0; n < value_count_weights; ++n) {
3765  shared_ptr[n] = 0;
3766  }
3767  for(int n = value_count_weights;
3768  n < value_count_weights + value_count_rightleft; n += 2) {
3769  shared_ptr[n] = -max_scalar;
3770  shared_ptr[n+1] = max_scalar;
3771  }
3772  });
3773  teamMember.team_barrier();
3774 
3775  Kokkos::parallel_for(
3776  Kokkos::TeamThreadRange(teamMember, begin, end),
3777  [=] (index_t ii) {
3778 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3779  // create the team shared data - each thread gets one of the arrays
3780  size_t sh_mem_size = sizeof(array_t) * (value_count_weights +
3781  value_count_rightleft) * teamMember.team_size();
3782 
3783  array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3784  sh_mem_size);
3785 
3786  // select the array for this thread
3787  Zoltan2_MJArrayType<array_t> array(&shared_ptr[teamMember.team_rank() *
3789 
3790  // create reducer which handles the Zoltan2_MJArrayType class
3792  max_scalar, array,
3795 
3796  Kokkos::parallel_reduce(
3797  Kokkos::TeamThreadRange(teamMember, begin, end),
3798 #if (__cplusplus > 201703L)
3799  [=, this] (size_t ii, Zoltan2_MJArrayType<array_t>& threadSum) {
3800 #else
3801  [=] (size_t ii, Zoltan2_MJArrayType<array_t>& threadSum) {
3802 #endif
3803 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3804 
3805  int i = permutations(ii);
3806  scalar_t coord = coordinates(i);
3807  array_t w = uniform_weights0 ? 1 : (array_t) weights(i,0);
3808 
3809  // now check each part and it's right cut
3810  index_t part = parts(i)/2;
3811 
3812  int upper = num_cuts;
3813  int lower = 0;
3814 
3815  // binary search - find matching part
3816  while(true) {
3817  scalar_t a = (part == 0) ? -max_scalar : cut_coordinates(part-1);
3818  scalar_t b = (part == num_cuts) ? max_scalar : cut_coordinates(part);
3819 
3820  if(coord >= a + sEpsilon && coord <= b - sEpsilon) {
3821 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3822  Kokkos::atomic_add(&shared_ptr[part*2], w);
3823 #else
3824  threadSum.ptr[part*2] += w;
3825 #endif
3826 
3827  parts(i) = part*2;
3828 
3829  // now handle the left/right closest part
3830 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3831  array_t new_value = (array_t) coord;
3832  array_t prev_value = shared_ptr[value_count_weights + part * 2 + 1];
3833  while(new_value < prev_value) {
3834  prev_value = Kokkos::atomic_compare_exchange(
3835  &shared_ptr[value_count_weights + part * 2 + 1],
3836  prev_value, new_value);
3837  }
3838  prev_value = shared_ptr[value_count_weights + part * 2 + 2];
3839  while(new_value > prev_value) {
3840  prev_value = Kokkos::atomic_compare_exchange(
3841  &shared_ptr[value_count_weights + part * 2 + 2],
3842  prev_value, new_value);
3843  }
3844 #else
3845  // note cut to left needs to set right closest and cut to right needs
3846  // to set left closest. It's index +1 and +2 instead of -1 and +0
3847  // because right/left segment is padded with an extra pair at
3848  // begining and end to avoid branching with if checks.
3849  if(coord < threadSum.ptr[value_count_weights + part * 2 + 1]) {
3850  threadSum.ptr[value_count_weights + part * 2 + 1] = coord;
3851  }
3852  if(coord > threadSum.ptr[value_count_weights + part * 2 + 2]) {
3853  threadSum.ptr[value_count_weights + part * 2 + 2] = coord;
3854  }
3855 #endif
3856 
3857  break;
3858  }
3859  else if(part != num_cuts) {
3860  if(coord < b + sEpsilon && coord > b - sEpsilon) {
3861  // Note if on cut we set right/left closest to the cut itself
3862  // but we add +2 because we buffered the area with an extra slot
3863  // to reduce cuda branching. So it's +2, +3 instead of +0, +1.
3864 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3865  Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3866  shared_ptr[value_count_weights + part * 2 + 2] = b;
3867  shared_ptr[value_count_weights + part * 2 + 3] = b;
3868 #else
3869  threadSum.ptr[part*2+1] += w;
3870  threadSum.ptr[value_count_weights + part * 2 + 2] = b;
3871  threadSum.ptr[value_count_weights + part * 2 + 3] = b;
3872 #endif
3873 
3874  parts(i) = part*2+1;
3875 
3876  // Need to scan up for any other cuts of same coordinate
3877  // This is costly but it's only relevant for the fix4785 test
3878  // which loads a lot of coordinates on the same point, so without
3879  // this our cuts would all just sit at 0.
3880  part_t base_b = part;
3881  scalar_t base_coord = cut_coordinates(base_b);
3882  part += 1;
3883  while(part < num_cuts) {
3884  b = cut_coordinates(part);
3885  scalar_t delta = b - base_coord;
3886  if(delta < 0) delta = -delta;
3887  if(delta < sEpsilon) {
3888  // Note if on cut we set right/left closest to the cut itself
3889  // but we add +2 because we buffered the area with an extra slot
3890  // to reduce cuda branching. So it's +2, +3 instead of +0, +1.
3891 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3892  Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3893  shared_ptr[value_count_weights + part * 2 + 2] = b;
3894  shared_ptr[value_count_weights + part * 2 + 3] = b;
3895 #else
3896  threadSum.ptr[part*2+1] += w;
3897  threadSum.ptr[value_count_weights + part * 2 + 2] = b;
3898  threadSum.ptr[value_count_weights + part * 2 + 3] = b;
3899 #endif
3900  }
3901  else { break; }
3902  ++part;
3903  }
3904  part = base_b - 1;
3905  while(part >= 0) {
3906  b = cut_coordinates(part);
3907  scalar_t delta = b - base_coord;
3908  if(delta < 0) delta = -delta;
3909  if(delta < sEpsilon) {
3910  // Note if on cut we set right/left closest to the cut itself
3911  // but we add +2 because we buffered the area with an extra slot
3912  // to reduce cuda branching. So it's +2, +3 instead of +0, +1.
3913 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3914  Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3915  shared_ptr[value_count_weights + part * 2 + 2] = b;
3916  shared_ptr[value_count_weights + part * 2 + 3] = b;
3917 #else
3918  threadSum.ptr[part*2+1] += w;
3919  threadSum.ptr[value_count_weights + part * 2 + 2] = b;
3920  threadSum.ptr[value_count_weights + part * 2 + 3] = b;
3921 #endif
3922  }
3923  else { break; }
3924  --part;
3925  }
3926 
3927  break;
3928  }
3929  }
3930 
3931  if(loop_count != 0) {
3932  // subsequent loops can just step towards target
3933  if(coord < b) {
3934  part -= 1;
3935  }
3936  else {
3937  part += 1;
3938  }
3939  }
3940  else {
3941  // initial loop binary search
3942  if(coord < b) {
3943  if(part == lower + 1) {
3944  part = lower;
3945  }
3946  else {
3947  upper = part - 1;
3948  part -= (part - lower)/2;
3949  }
3950  }
3951  else if(part == upper - 1) {
3952  part = upper;
3953  }
3954  else {
3955  lower = part + 1;
3956  part += (upper - part)/2;
3957  }
3958  }
3959  }
3960 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3961  });
3962 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3963  }, arraySumReducer);
3964 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3965 
3966  teamMember.team_barrier();
3967 
3968  // collect all the team's results
3969 #if (__cplusplus > 201703L)
3970  Kokkos::single(Kokkos::PerTeam(teamMember), [=, this] () {
3971 #else
3972  Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
3973 #endif
3974  for(int n = 0; n < value_count_weights; ++n) {
3975 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3976  Kokkos::atomic_add(&current_part_weights(n),
3977  static_cast<double>(shared_ptr[n]));
3978 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3979  teamSum[n] += array.ptr[n];
3980 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3981  }
3982 
3983 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3984  int insert_left = 0;
3985  int insert_right = 0;
3986 #endif
3987 
3988  for(int n = 2 + value_count_weights;
3989  n < value_count_weights + value_count_rightleft - 2; n += 2) {
3990 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3991  scalar_t new_value = shared_ptr[n+1];
3992  scalar_t prev_value = current_right_closest(insert_right);
3993  while(new_value < prev_value) {
3994  prev_value = Kokkos::atomic_compare_exchange(
3995  &current_right_closest(insert_right), prev_value, new_value);
3996  }
3997 
3998  new_value = shared_ptr[n];
3999  prev_value = current_left_closest(insert_left);
4000  while(new_value > prev_value) {
4001  prev_value = Kokkos::atomic_compare_exchange(
4002  &current_left_closest(insert_left), prev_value, new_value);
4003  }
4004 
4005  ++insert_left;
4006  ++insert_right;
4007 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4008  if(array.ptr[n] > teamSum[n]) {
4009  teamSum[n] = array.ptr[n];
4010  }
4011  if(array.ptr[n+1] < teamSum[n+1]) {
4012  teamSum[n+1] = array.ptr[n+1];
4013  }
4014 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4015  }
4016  });
4017 
4018  teamMember.team_barrier();
4019  }
4020 
4021 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4022  KOKKOS_INLINE_FUNCTION
4023  void join(value_type dst, const value_type src) const {
4024  for(int n = 0; n < value_count_weights; ++n) {
4025  dst[n] += src[n];
4026  }
4027 
4028  for(int n = value_count_weights + 2;
4029  n < value_count_weights + value_count_rightleft - 2; n += 2) {
4030  if(src[n] > dst[n]) {
4031  dst[n] = src[n];
4032  }
4033  if(src[n+1] < dst[n+1]) {
4034  dst[n+1] = src[n+1];
4035  }
4036  }
4037  }
4038 
4039  KOKKOS_INLINE_FUNCTION void init (value_type dst) const {
4040  for(int n = 0; n < value_count_weights; ++n) {
4041  dst[n] = 0;
4042  }
4043 
4044  for(int n = value_count_weights;
4045  n < value_count_weights + value_count_rightleft; n += 2) {
4046  dst[n] = -max_scalar;
4047  dst[n+1] = max_scalar;
4048  }
4049  }
4050 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4051 };
4052 
4060 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4061  typename mj_part_t, typename mj_node_t>
4062 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t,mj_part_t, mj_node_t>::
4063  mj_1D_part_get_part_weights(
4064  mj_part_t current_concurrent_num_parts,
4065  mj_part_t current_work_part,
4066  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
4067  int loop_count)
4068 {
4069  auto local_is_cut_line_determined = is_cut_line_determined;
4070  auto local_thread_part_weights = thread_part_weights;
4071  auto local_thread_cut_left_closest_point = thread_cut_left_closest_point;
4072  auto local_thread_cut_right_closest_point = thread_cut_right_closest_point;
4073 
4074  // Create some locals so we don't use this inside the kernels
4075  // which causes problems
4076  auto local_sEpsilon = this->sEpsilon;
4077  auto local_assigned_part_ids = this->assigned_part_ids;
4078  auto local_coordinate_permutations = this->coordinate_permutations;
4079  auto local_mj_weights = this->mj_weights;
4080  auto local_part_xadj = this->part_xadj;
4081  auto local_global_min_max_coord_total_weight =
4082  this->global_min_max_coord_total_weight;
4083 
4084  typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;
4085 
4086  auto local_device_num_partitioning_in_current_dim =
4087  device_num_partitioning_in_current_dim;
4088 
4089  Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);
4090  auto local_device_incomplete_cut_count = device_incomplete_cut_count;
4091 
4092  mj_part_t total_part_shift = 0;
4093 
4094  mj_part_t concurrent_cut_shifts = 0;
4095  for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
4096  Kokkos::View<mj_scalar_t *, device_t> local_temp_cut_coords =
4097  Kokkos::subview(temp_cut_coords, std::pair<mj_lno_t, mj_lno_t>(
4098  concurrent_cut_shifts, temp_cut_coords.size()));
4099 
4100  mj_part_t num_parts =
4101  host_num_partitioning_in_current_dim(current_work_part + kk);
4102  mj_part_t num_cuts = num_parts - 1;
4103  mj_part_t total_part_count = num_parts + num_cuts;
4104  mj_part_t weight_array_length = num_cuts + num_parts;
4105 
4106  // for right/left closest + buffer cut on either side
4107  mj_part_t right_left_array_length = (num_cuts + 2) * 2;
4108 
4109  if(this->incomplete_cut_count(kk) == 0) {
4110  total_part_shift += total_part_count;
4111  concurrent_cut_shifts += num_cuts;
4112  continue;
4113  }
4114 
4115  // if not set use 60 - was initial testing amount but somewhat arbitrary
4116  auto policy_ReduceWeightsFunctor = policy_t(
4117  mj_num_teams ? mj_num_teams : 60, Kokkos::AUTO);
4118 
4119 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4120  int total_array_length =
4121  weight_array_length + right_left_array_length;
4122 #endif
4123 
4124  // Using float here caused some numerical errors for coord on cut calculations.
4125  // Probably that can be fixed with proper epsilon adjustment but since cuda
4126  // doesn't reduce right now the shared memory pressure is no longer relevant.
4127  // Just use scalar_t to match the original algorithm.
4128  typedef mj_scalar_t array_t;
4129 
4130 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4131  Kokkos::View<array_t*, Kokkos::HostSpace> reduce_array("reduce_array", total_array_length);
4132 #endif // KOKKOS_ENABLE_CUDA && KOKKOS_ENABLE_HIP
4133 
4134  int offset_cuts = 0;
4135  for(int kk2 = 0; kk2 < kk; ++kk2) {
4136  offset_cuts +=
4137  host_num_partitioning_in_current_dim(current_work_part + kk2) - 1;
4138  }
4139  Kokkos::View<double *, device_t> my_current_part_weights =
4140  Kokkos::subview(local_thread_part_weights,
4141  std::pair<mj_lno_t, mj_lno_t>(total_part_shift,
4142  total_part_shift + total_part_count));
4143  Kokkos::View<mj_scalar_t *, device_t> my_current_left_closest =
4144  Kokkos::subview(local_thread_cut_left_closest_point,
4145  std::pair<mj_lno_t, mj_lno_t>(
4146  offset_cuts,
4147  local_thread_cut_left_closest_point.size()));
4148  Kokkos::View<mj_scalar_t *, device_t> my_current_right_closest =
4149  Kokkos::subview(local_thread_cut_right_closest_point,
4150  std::pair<mj_lno_t, mj_lno_t>(
4151  offset_cuts,
4152  local_thread_cut_right_closest_point.size()));
4153 
4154  array_t max_scalar = std::numeric_limits<array_t>::max();
4155 
4156 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4157  // initialize values
4158  Kokkos::parallel_for(
4159  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4160  KOKKOS_LAMBDA (int dummy) {
4161  for(int n = 0; n < weight_array_length; ++n) {
4162  my_current_part_weights(n) = 0;
4163  }
4164  for(int n = 0; n < num_cuts; ++n) {
4165  my_current_left_closest(n) = -max_scalar;
4166  my_current_right_closest(n) = max_scalar;
4167  }
4168  });
4169 #endif
4170 
4171  mj_part_t concurrent_current_part =
4172  current_work_part + kk;
4173 
4174  ReduceWeightsFunctor<policy_t, mj_scalar_t, mj_part_t, mj_lno_t,
4175  typename mj_node_t::device_type, array_t>
4176  teamFunctor(
4177  loop_count,
4178  max_scalar,
4179  concurrent_current_part,
4180  num_cuts,
4181  current_work_part,
4182  current_concurrent_num_parts,
4183  right_left_array_length,
4184  weight_array_length,
4185  coordinate_permutations,
4186  mj_current_dim_coords,
4187  mj_weights,
4188  assigned_part_ids,
4189  local_temp_cut_coords,
4190  part_xadj,
4191  mj_uniform_weights(0), // host and currently only relevant to slot 0
4192  sEpsilon
4193 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4194  ,my_current_part_weights,
4195  my_current_left_closest,
4196  my_current_right_closest
4197 #endif
4198  );
4199 
4200 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4201  Kokkos::parallel_for(policy_ReduceWeightsFunctor, teamFunctor);
4202 #else
4203  Kokkos::parallel_reduce(policy_ReduceWeightsFunctor,
4204  teamFunctor, reduce_array);
4205  Kokkos::fence();
4206 #endif
4207 
4208 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4209  auto hostArray = Kokkos::create_mirror_view(my_current_part_weights);
4210 
4211  for(int i = 0; i < static_cast<int>(total_part_count); ++i) {
4212  hostArray(i) = reduce_array[i];
4213  }
4214 
4215  Kokkos::deep_copy(my_current_part_weights, hostArray);
4216 
4217  auto hostLeftArray = Kokkos::create_mirror_view(my_current_left_closest);
4218  auto hostRightArray = Kokkos::create_mirror_view(my_current_right_closest);
4219  for(mj_part_t cut = 0; cut < num_cuts; ++cut) {
4220  hostLeftArray(cut) = reduce_array[weight_array_length + (cut+1)*2+0];
4221  hostRightArray(cut) = reduce_array[weight_array_length + (cut+1)*2+1];
4222  }
4223  Kokkos::deep_copy(my_current_left_closest, hostLeftArray);
4224  Kokkos::deep_copy(my_current_right_closest, hostRightArray);
4225 #endif
4226 
4227  total_part_shift += total_part_count;
4228  concurrent_cut_shifts += num_cuts;
4229  }
4230 
4231  auto local_temp_cut_coords = temp_cut_coords;
4232 
4233  Kokkos::parallel_for(
4234  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
4235  (0, current_concurrent_num_parts), KOKKOS_LAMBDA(mj_part_t kk) {
4236  mj_part_t num_parts = local_device_num_partitioning_in_current_dim(
4237  current_work_part + kk);
4238  mj_part_t num_cuts = num_parts - 1;
4239  mj_part_t total_part_count = num_parts + num_cuts;
4240 
4241  if(local_device_incomplete_cut_count(kk) > 0) {
4242  // get the prefix sum
4243  // This is an inefficiency but not sure if it matters much
4244  size_t offset = 0;
4245  size_t offset_cuts = 0;
4246  for(mj_part_t kk2 = 0; kk2 < kk; ++kk2) {
4247  auto num_parts_kk2 = local_device_num_partitioning_in_current_dim(
4248  current_work_part + kk2);
4249  offset += num_parts_kk2 * 2 - 1;
4250  offset_cuts += num_parts_kk2 - 1;
4251  }
4252 
4253  for(mj_part_t i = 1; i < total_part_count; ++i) {
4254  // check for cuts sharing the same position; all cuts sharing a position
4255  // have the same weight == total weight for all cuts sharing the
4256  // position. Don't want to accumulate that total weight more than once.
4257  if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
4258  std::abs(local_temp_cut_coords(offset_cuts + i / 2) -
4259  local_temp_cut_coords(offset_cuts + i /2 - 1))
4260  < local_sEpsilon) {
4261  // i % 2 = 0 when part i represents the cut coordinate.
4262  // if it is a cut, and if next cut also has the same coordinate, then
4263  // dont addup.
4264  local_thread_part_weights(offset + i)
4265  = local_thread_part_weights(offset + i-2);
4266  continue;
4267  }
4268 
4269  // otherwise do the prefix sum.
4270  local_thread_part_weights(offset + i) +=
4271  local_thread_part_weights(offset + i-1);
4272  }
4273  }
4274  });
4275 }
4276 
4284 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4285  typename mj_part_t, typename mj_node_t>
4286 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4287  mj_combine_rightleft_and_weights(
4288  mj_part_t current_work_part,
4289  mj_part_t current_concurrent_num_parts)
4290 {
4291  auto local_thread_part_weights = this->thread_part_weights;
4292  auto local_is_cut_line_determined = this->is_cut_line_determined;
4293  auto local_thread_cut_left_closest_point =
4294  this->thread_cut_left_closest_point;
4295  auto local_thread_cut_right_closest_point =
4296  this->thread_cut_right_closest_point;
4297  auto local_total_part_weight_left_right_closests =
4298  this->total_part_weight_left_right_closests;
4299  auto local_device_num_partitioning_in_current_dim =
4300  device_num_partitioning_in_current_dim;
4301  Kokkos::parallel_for(
4302  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0,1),
4303  KOKKOS_LAMBDA (int dummy) {
4304 
4305  size_t tlr_array_shift = 0;
4306  mj_part_t cut_shift = 0;
4307  size_t total_part_array_shift = 0;
4308 
4309  // iterate for all concurrent parts to find the left and right closest
4310  // points in the process.
4311  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
4312 
4313  mj_part_t num_parts_in_part =
4314  local_device_num_partitioning_in_current_dim(current_work_part + i);
4315  mj_part_t num_cuts_in_part = num_parts_in_part - 1;
4316  size_t num_total_part_in_part =
4317  num_parts_in_part + size_t (num_cuts_in_part);
4318 
4319  // iterate for cuts in a single part.
4320  for(int ii = 0; ii < num_cuts_in_part; ++ii) {
4321  mj_part_t next = tlr_array_shift + ii;
4322  mj_part_t cut_index = cut_shift + ii;
4323 
4324  if(!local_is_cut_line_determined(cut_index)) {
4325  mj_scalar_t left_closest_in_process =
4326  local_thread_cut_left_closest_point(cut_index);
4327  mj_scalar_t right_closest_in_process =
4328  local_thread_cut_right_closest_point(cut_index);
4329 
4330  // store the left and right closes points.
4331  local_total_part_weight_left_right_closests(
4332  num_total_part_in_part + next) = left_closest_in_process;
4333 
4334  local_total_part_weight_left_right_closests(
4335  num_total_part_in_part + num_cuts_in_part + next) =
4336  right_closest_in_process;
4337  }
4338  }
4339 
4340  for(size_t j = 0; j < num_total_part_in_part; ++j) {
4341  mj_part_t cut_ind = j / 2 + cut_shift;
4342 
4343  // need to check j != num_total_part_in_part - 1
4344  // which is same as j/2 != num_cuts_in_part.
4345  // we cannot check it using cut_ind, because of the concurrent part
4346  // concantanetion.
4347  if(j == num_total_part_in_part - 1 ||
4348  !local_is_cut_line_determined(cut_ind)) {
4349  double pwj = local_thread_part_weights(total_part_array_shift + j);
4350  local_total_part_weight_left_right_closests(tlr_array_shift + j) = pwj;
4351  }
4352  }
4353 
4354  // set the shift position in the arrays
4355  cut_shift += num_cuts_in_part;
4356  tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
4357  total_part_array_shift += num_total_part_in_part;
4358  }
4359  });
4360 }
4361 
4374 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4375  typename mj_part_t, typename mj_node_t>
4376 KOKKOS_INLINE_FUNCTION
4377 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
4378  mj_node_t>::mj_calculate_new_cut_position(mj_scalar_t cut_upper_bound,
4379  mj_scalar_t cut_lower_bound,
4380  mj_scalar_t cut_upper_weight,
4381  mj_scalar_t cut_lower_weight,
4382  mj_scalar_t expected_weight,
4383  mj_scalar_t &new_cut_position,
4384  mj_scalar_t sEpsilon) {
4385 
4386  if(std::abs(cut_upper_bound - cut_lower_bound) < sEpsilon) {
4387  new_cut_position = cut_upper_bound; //or lower bound does not matter.
4388  }
4389 
4390  if(std::abs(cut_upper_weight - cut_lower_weight) < sEpsilon) {
4391  new_cut_position = cut_lower_bound;
4392  }
4393 
4394  mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
4395  mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
4396  mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
4397 
4398  mj_scalar_t required_shift = (my_weight_diff / weight_range);
4399  int scale_constant = 20;
4400  int shiftint= int (required_shift * scale_constant);
4401  if(shiftint == 0) shiftint = 1;
4402  required_shift = mj_scalar_t (shiftint) / scale_constant;
4403  new_cut_position = coordinate_range * required_shift + cut_lower_bound;
4404 }
4405 
4406 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4407 
4408 template<class policy_t, class scalar_t>
4410 
4415 
4416  KOKKOS_INLINE_FUNCTION ArrayReducer(
4417  value_type &val,
4418  int mj_value_count) :
4419  value(&val),
4420  value_count(mj_value_count)
4421  {}
4422 
4423  KOKKOS_INLINE_FUNCTION
4425  return *value;
4426  }
4427 
4428  KOKKOS_INLINE_FUNCTION
4429  void join(value_type& dst, const value_type& src) const {
4430  for(int n = 0; n < value_count; ++n) {
4431  dst.ptr[n] += src.ptr[n];
4432  }
4433  }
4434 
4435  KOKKOS_INLINE_FUNCTION void init (value_type& dst) const {
4436  dst.ptr = value->ptr; // must update ptr
4437  for(int n = 0; n < value_count; ++n) {
4438  dst.ptr[n] = 0;
4439  }
4440  }
4441 };
4442 
4443 #endif
4444 
4445 template<class policy_t, class scalar_t, class part_t, class index_t,
4446  class device_t, class array_t>
4448  typedef typename policy_t::member_type member_type;
4449  typedef Kokkos::View<scalar_t*> scalar_view_t;
4450 
4451 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4452  typedef array_t value_type[];
4453 #endif
4454 
4457  Kokkos::View<index_t*, device_t> permutations;
4458  Kokkos::View<scalar_t *, device_t> coordinates;
4459  Kokkos::View<part_t*, device_t> parts;
4460  Kokkos::View<index_t *, device_t> part_xadj;
4461  Kokkos::View<index_t *, device_t> track_on_cuts;
4462 
4463 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4464  Kokkos::View<int *, device_t> local_point_counts;
4465 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4466 
4468  part_t mj_concurrent_current_part,
4469  part_t mj_weight_array_size,
4470  Kokkos::View<index_t*, device_t> & mj_permutations,
4471  Kokkos::View<scalar_t *, device_t> & mj_coordinates,
4472  Kokkos::View<part_t*, device_t> & mj_parts,
4473  Kokkos::View<index_t *, device_t> & mj_part_xadj,
4474  Kokkos::View<index_t *, device_t> & mj_track_on_cuts
4475 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4476  ,Kokkos::View<int *, device_t> & mj_local_point_counts
4477 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4478  ) :
4479  concurrent_current_part(mj_concurrent_current_part),
4480  value_count(mj_weight_array_size),
4481  permutations(mj_permutations),
4482  coordinates(mj_coordinates),
4483  parts(mj_parts),
4484  part_xadj(mj_part_xadj),
4485  track_on_cuts(mj_track_on_cuts)
4486 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4487  ,local_point_counts(mj_local_point_counts)
4488 #endif
4489  {
4490  }
4491 
4492  size_t team_shmem_size (int team_size) const {
4493 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4494  int result = sizeof(array_t) * (value_count);
4495 #else
4496  int result = sizeof(array_t) * (value_count) * team_size;
4497 #endif
4498 
4499  // pad this to a multiple of 8 or it will run corrupt
4500  int remainder = result % 8;
4501  if(remainder != 0) {
4502  result += 8 - remainder;
4503  }
4504  return result;
4505  }
4506 
4507  KOKKOS_INLINE_FUNCTION
4508 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4509  void operator() (const member_type & teamMember) const {
4510 #else
4511  void operator() (const member_type & teamMember, value_type teamSum) const {
4512 #endif
4513  index_t all_begin = (concurrent_current_part == 0) ? 0 :
4514  part_xadj(concurrent_current_part - 1);
4515  index_t all_end = part_xadj(concurrent_current_part);
4516 
4517  index_t num_working_points = all_end - all_begin;
4518  int num_teams = teamMember.league_size();
4519 
4520  index_t stride = num_working_points / num_teams;
4521  if((num_working_points % num_teams) > 0) {
4522  stride += 1; // make sure we have coverage for the final points
4523  }
4524 
4525  index_t begin = all_begin + stride * teamMember.league_rank();
4526  index_t end = begin + stride;
4527  if(end > all_end) {
4528  end = all_end; // the last team may have less work than the other teams
4529  }
4530 
4531  int track_on_cuts_insert_index = track_on_cuts.size() - 1;
4532 
4533  // create the team shared data - each thread gets one of the arrays
4534 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4535  size_t sh_mem_size = sizeof(array_t) * (value_count);
4536 #else
4537  size_t sh_mem_size =
4538  sizeof(array_t) * (value_count) * teamMember.team_size();
4539 #endif
4540 
4541  array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
4542  sh_mem_size);
4543 
4544 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4545  // init the shared array to 0
4546  Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
4547  for(int n = 0; n < value_count; ++n) {
4548  shared_ptr[n] = 0;
4549  }
4550  });
4551  teamMember.team_barrier();
4552 
4553  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, begin, end),
4554  [=] (index_t ii) {
4555 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4556  // select the array for this thread
4557  Zoltan2_MJArrayType<array_t> array(&shared_ptr[teamMember.team_rank() *
4558  (value_count)]);
4559 
4560  // create reducer which handles the Zoltan2_MJArrayType class
4561  ArrayReducer<policy_t, array_t> arrayReducer(array, value_count);
4562 
4563  Kokkos::parallel_reduce(
4564  Kokkos::TeamThreadRange(teamMember, begin, end),
4565 #if (__cplusplus > 201703L)
4566  [=, this] (size_t ii, Zoltan2_MJArrayType<array_t>& threadSum) {
4567 #else
4568  [=] (size_t ii, Zoltan2_MJArrayType<array_t>& threadSum) {
4569 #endif
4570 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4571 
4572  index_t coordinate_index = permutations(ii);
4573  part_t place = parts(coordinate_index);
4574  part_t part = place / 2;
4575  if(place % 2 == 0) {
4576 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4577  Kokkos::atomic_add(&shared_ptr[part], 1);
4578 #else
4579  threadSum.ptr[part] += 1;
4580 #endif
4581 
4582  parts(coordinate_index) = part;
4583  }
4584  else {
4585  // fill a tracking array so we can process these slower points
4586  // in next cycle
4587  index_t set_index = Kokkos::atomic_fetch_add(
4588  &track_on_cuts(track_on_cuts_insert_index), 1);
4589  track_on_cuts(set_index) = ii;
4590  }
4591 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4592  });
4593 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4594  }, arrayReducer);
4595 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4596 
4597  teamMember.team_barrier();
4598 
4599  // collect all the team's results
4600 #if (__cplusplus > 201703L)
4601  Kokkos::single(Kokkos::PerTeam(teamMember), [=, this] () {
4602 #else
4603  Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
4604 #endif
4605  for(int n = 0; n < value_count; ++n) {
4606 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4607  Kokkos::atomic_add(&local_point_counts(n), shared_ptr[n]);
4608 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4609  teamSum[n] += array.ptr[n];
4610 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4611  }
4612  });
4613 
4614  teamMember.team_barrier();
4615  }
4616 
4617 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4618 
4619  KOKKOS_INLINE_FUNCTION
4620  void join(value_type dst, const value_type src) const {
4621  for(int n = 0; n < value_count; ++n) {
4622  dst[n] += src[n];
4623  }
4624  }
4625 
4626  KOKKOS_INLINE_FUNCTION void init (value_type dst) const {
4627  for(int n = 0; n < value_count; ++n) {
4628  dst[n] = 0;
4629  }
4630  }
4631 #endif
4632 };
4633 
4649 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4650  typename mj_part_t, typename mj_node_t>
4651 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4652 mj_create_new_partitions(
4653  mj_part_t num_parts,
4654  mj_part_t current_concurrent_work_part,
4655  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
4656  Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
4657  Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
4658  Kokkos::View<mj_lno_t *, device_t> & out_part_xadj)
4659 {
4660  // Get locals for cuda
4661  auto local_thread_part_weight_work = this->thread_part_weight_work;
4662  auto local_point_counts = this->thread_point_counts;
4663  auto local_distribute_points_on_cut_lines =
4664  this->distribute_points_on_cut_lines;
4665  auto local_thread_cut_line_weight_to_put_left =
4666  this->thread_cut_line_weight_to_put_left;
4667  auto local_sEpsilon = this->sEpsilon;
4668  auto local_coordinate_permutations = this->coordinate_permutations;
4669  auto local_mj_weights = this->mj_weights;
4670  auto local_assigned_part_ids = this->assigned_part_ids;
4671  auto local_new_coordinate_permutations = this->new_coordinate_permutations;
4672 
4673  mj_part_t num_cuts = num_parts - 1;
4674 
4675  Kokkos::parallel_for(
4676  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4677  KOKKOS_LAMBDA(int dummy) {
4678 
4679  if(local_distribute_points_on_cut_lines) {
4680  for(int i = 0; i < num_cuts; ++i) {
4681  mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
4682  if(left_weight > local_sEpsilon) {
4683  // the weight of thread ii on cut.
4684  mj_scalar_t thread_ii_weight_on_cut =
4685  local_thread_part_weight_work(i * 2 + 1) -
4686  local_thread_part_weight_work(i * 2);
4687 
4688  if(thread_ii_weight_on_cut < left_weight) {
4689  // if left weight is bigger than threads weight on cut.
4690  local_thread_cut_line_weight_to_put_left(i) =
4691  thread_ii_weight_on_cut;
4692  }
4693  else {
4694  // if thread's weight is bigger than space, then put only a portion.
4695  local_thread_cut_line_weight_to_put_left(i) = left_weight;
4696  }
4697  left_weight -= thread_ii_weight_on_cut;
4698  }
4699  else {
4700  local_thread_cut_line_weight_to_put_left(i) = 0;
4701  }
4702  }
4703 
4704  // this is a special case. If cutlines share the same coordinate,
4705  // their weights are equal. We need to adjust the ratio for that.
4706  for(mj_part_t i = num_cuts - 1; i > 0 ; --i) {
4707  if(std::abs(current_concurrent_cut_coordinate(i) -
4708  current_concurrent_cut_coordinate(i -1)) < local_sEpsilon) {
4709  local_thread_cut_line_weight_to_put_left(i) -=
4710  local_thread_cut_line_weight_to_put_left(i - 1);
4711  }
4712  local_thread_cut_line_weight_to_put_left(i) =
4713  static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
4714  least_signifiance) * significance_mul) /
4715  static_cast<mj_scalar_t>(significance_mul);
4716  }
4717  }
4718 
4719  for(mj_part_t i = 0; i < num_parts; ++i) {
4720  local_point_counts(i) = 0;
4721  }
4722  });
4723 
4724  mj_lno_t coordinate_begin_index =
4725  current_concurrent_work_part == 0 ? 0 :
4726  host_part_xadj(current_concurrent_work_part - 1);
4727  mj_lno_t coordinate_end_index =
4728  host_part_xadj(current_concurrent_work_part);
4729 
4730  mj_lno_t total_on_cut;
4731  Kokkos::parallel_reduce("Get total_on_cut",
4732  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (
4733  coordinate_begin_index, coordinate_end_index),
4734  KOKKOS_LAMBDA(int ii, mj_lno_t & val) {
4735  mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4736  mj_part_t coordinate_assigned_place =
4737  local_assigned_part_ids(coordinate_index);
4738  if(coordinate_assigned_place % 2 == 1) {
4739  val += 1;
4740  }
4741  }, total_on_cut);
4742 
4743  Kokkos::View<mj_lno_t *, device_t> track_on_cuts;
4744  if(total_on_cut > 0) {
4745  track_on_cuts = Kokkos::View<mj_lno_t *, device_t>(
4746  "track_on_cuts", // would do WithoutInitialization but need last init to 0
4747  total_on_cut + 1); // extra index to use for tracking
4748  }
4749 
4750  // here we need to parallel reduce an array to count coords in each part
4751  // atomically adding, especially for low part count would kill us
4752  // in the original setup we kept arrays allocated for each thread but for
4753  // the cuda version we'd like to avoid allocating N arrays for the number
4754  // of teams/threads which would be complicated based on running openmp or
4755  // cuda.
4756  typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;
4757 
4758  // if not set use 60 - somewhat arbitrary based on initial performance tests
4759  int use_num_teams = mj_num_teams ? mj_num_teams : 60;
4760 
4761  auto policy_ReduceFunctor = policy_t(use_num_teams, Kokkos::AUTO);
4762  typedef int array_t;
4763 
4764  // just need parts - on the cuts will be handled in a separate serial
4765  // call after this.
4766 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4767  Kokkos::View<array_t*, Kokkos::HostSpace> reduce_array("reduce_array", num_parts);
4768 #endif
4769 
4770  ReduceArrayFunctor<policy_t, mj_scalar_t, mj_part_t, mj_lno_t,
4771  typename mj_node_t::device_type, array_t>teamFunctor(
4772  current_concurrent_work_part,
4773  num_parts,
4774  coordinate_permutations,
4775  mj_current_dim_coords,
4776  assigned_part_ids,
4777  part_xadj,
4778  track_on_cuts
4779 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4780  ,local_point_counts
4781 #endif
4782  );
4783 
4784 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4785  Kokkos::parallel_for(policy_ReduceFunctor, teamFunctor);
4786 #else
4787  Kokkos::parallel_reduce(policy_ReduceFunctor, teamFunctor, reduce_array);
4788  Kokkos::fence();
4789 #endif
4790 
4791 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4792  for(mj_part_t part = 0; part < num_parts; ++part) {
4793  local_point_counts(part) = reduce_array[part];
4794  }
4795 #endif
4796 
4797  // the last member is utility used for atomically inserting the values.
4798  // Sorting here avoids potential indeterminancy in the partitioning results
4799  if(track_on_cuts.size() > 0) { // size 0 unused, or size is minimum of 2
4800  auto track_on_cuts_sort = Kokkos::subview(track_on_cuts,
4801  std::pair<mj_lno_t, mj_lno_t>(0, track_on_cuts.size() - 1)); // do not sort last element
4802  Kokkos::sort(track_on_cuts_sort);
4803  }
4804 
4805  bool uniform_weights0 = this->mj_uniform_weights(0);
4806  Kokkos::parallel_for(
4807  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4808  KOKKOS_LAMBDA (int dummy) {
4809 
4810  for(int j = 0; j < total_on_cut; ++j) {
4811  int ii = track_on_cuts(j);
4812  mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4813  mj_scalar_t coordinate_weight = uniform_weights0 ? 1 :
4814  local_mj_weights(coordinate_index,0);
4815  mj_part_t coordinate_assigned_place =
4816  local_assigned_part_ids(coordinate_index);
4817  mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
4818  // if it is on the cut.
4819  if(local_distribute_points_on_cut_lines &&
4820  local_thread_cut_line_weight_to_put_left(
4821  coordinate_assigned_part) > local_sEpsilon) {
4822  // if the rectilinear partitioning is allowed,
4823  // and the thread has still space to put on the left of the cut
4824  // then thread puts the vertex to left.
4825  local_thread_cut_line_weight_to_put_left(
4826  coordinate_assigned_part) -= coordinate_weight;
4827  // if putting the vertex to left increased the weight more
4828  // than expected, and if the next cut is on the same coordinate,
4829  // then we need to adjust how much weight next cut puts to its left as
4830  // well, in order to take care of the imbalance.
4831  if(local_thread_cut_line_weight_to_put_left(
4832  coordinate_assigned_part) < 0 && coordinate_assigned_part <
4833  num_cuts - 1 &&
4834  std::abs(current_concurrent_cut_coordinate(
4835  coordinate_assigned_part+1) -
4836  current_concurrent_cut_coordinate(
4837  coordinate_assigned_part)) < local_sEpsilon)
4838  {
4839  local_thread_cut_line_weight_to_put_left(
4840  coordinate_assigned_part + 1) +=
4841  local_thread_cut_line_weight_to_put_left(
4842  coordinate_assigned_part);
4843  }
4844  ++local_point_counts(coordinate_assigned_part);
4845  local_assigned_part_ids(coordinate_index) =
4846  coordinate_assigned_part;
4847  }
4848  else {
4849  // if there is no more space on the left, put the coordinate to the
4850  // right of the cut.
4851  ++coordinate_assigned_part;
4852  // this while loop is necessary when a line is partitioned into more
4853  // than 2 parts.
4854  while(local_distribute_points_on_cut_lines &&
4855  coordinate_assigned_part < num_cuts)
4856  {
4857  // traverse all the cut lines having the same partitiong
4858  if(std::abs(current_concurrent_cut_coordinate(
4859  coordinate_assigned_part) -
4860  current_concurrent_cut_coordinate(
4861  coordinate_assigned_part - 1)) < local_sEpsilon)
4862  {
4863  // if line has enough space on left, put it there.
4864  if(local_thread_cut_line_weight_to_put_left(
4865  coordinate_assigned_part) > local_sEpsilon &&
4866  local_thread_cut_line_weight_to_put_left(
4867  coordinate_assigned_part) >=
4868  std::abs(local_thread_cut_line_weight_to_put_left(
4869  coordinate_assigned_part) - coordinate_weight))
4870  {
4871  local_thread_cut_line_weight_to_put_left(
4872  coordinate_assigned_part) -= coordinate_weight;
4873  // Again if it put too much on left of the cut,
4874  // update how much the next cut sharing the same coordinate will
4875  // put to its left.
4876  if(local_thread_cut_line_weight_to_put_left(
4877  coordinate_assigned_part) < 0 &&
4878  coordinate_assigned_part < num_cuts - 1 &&
4879  std::abs(current_concurrent_cut_coordinate(
4880  coordinate_assigned_part+1) -
4881  current_concurrent_cut_coordinate(
4882  coordinate_assigned_part)) < local_sEpsilon)
4883  {
4884  local_thread_cut_line_weight_to_put_left(
4885  coordinate_assigned_part + 1) +=
4886  local_thread_cut_line_weight_to_put_left(
4887  coordinate_assigned_part);
4888  }
4889  break;
4890  }
4891  }
4892  else {
4893  break;
4894  }
4895  ++coordinate_assigned_part;
4896  }
4897  local_point_counts(coordinate_assigned_part) += 1;
4898  local_assigned_part_ids(coordinate_index) = coordinate_assigned_part;
4899  }
4900  }
4901 
4902  for(int j = 0; j < num_parts; ++j) {
4903  out_part_xadj(j) = local_point_counts(j);
4904  local_point_counts(j) = 0;
4905 
4906  if(j != 0) {
4907  out_part_xadj(j) += out_part_xadj(j - 1);
4908  local_point_counts(j) += out_part_xadj(j - 1);
4909  }
4910  }
4911  });
4912 
4913  // here we will determine insert indices for N teams
4914  // then all the teams can fill
4915 
4916 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4917 
4918  // This is the fastest so far - just straight atomic writes for CUDA
4919  // However this is not a deterministic result since it is atomic.
4920  // The final result will be deterministic.
4921  Kokkos::parallel_for(
4922  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
4923  coordinate_begin_index, coordinate_end_index),
4924  KOKKOS_LAMBDA (mj_lno_t ii) {
4925  mj_lno_t i = local_coordinate_permutations(ii);
4926  mj_part_t p = local_assigned_part_ids(i);
4927  mj_lno_t idx = Kokkos::atomic_fetch_add(&local_point_counts(p), 1);
4928  local_new_coordinate_permutations(coordinate_begin_index + idx) = i;
4929  });
4930 
4931 #else
4932 
4933 #ifdef KOKKOS_ENABLE_OPENMP
4934  // will return and fix this - revert back to 1 for clear auto testing
4935  const int num_threads = 1; // Kokkos::OpenMP::impl_max_hardware_threads();
4936 #else
4937  const int num_threads = 1;
4938 #endif
4939 
4940  const int num_teams = 1; // cuda is handled above using a different format
4941 
4942  // allow init - we want all 0's first
4943  Kokkos::View<mj_lno_t*, device_t>
4944  point_counter("insert indices", num_teams * num_threads * num_parts);
4945 
4946  // count how many coords per thread
4947  // then we will fill each independently
4948  Kokkos::TeamPolicy<typename mj_node_t::execution_space>
4949  block_policy(num_teams, num_threads);
4950  typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
4952  mj_lno_t range = coordinate_end_index - coordinate_begin_index;
4953  mj_lno_t block_size = range / num_teams + 1;
4954  Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(member_type team_member) {
4955  int team = team_member.league_rank();
4956  int team_offset = team * num_threads * num_parts;
4957  mj_lno_t begin = coordinate_begin_index + team * block_size;
4958  mj_lno_t end = begin + block_size;
4959  if(end > coordinate_end_index) {
4960  end = coordinate_end_index;
4961  }
4962 
4963  Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
4964  [=] (mj_lno_t ii) {
4965  int thread = team_member.team_rank();
4966  mj_lno_t i = local_coordinate_permutations(ii);
4967  mj_part_t p = local_assigned_part_ids(i);
4968  int index = team_offset + thread * num_parts + p;
4969  ++point_counter(index);
4970  });
4971  });
4972 
4973  // now prefix sum
4974  // we currently have the counts in the slots
4975  // we want the first counter for each part to be 0
4976  // then the rest should be the sum of all the priors
4977  Kokkos::parallel_for(
4978  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4979  KOKKOS_LAMBDA (int dummy) {
4980  int num_sets = point_counter.size() / num_parts;
4981  for(int set = num_sets - 1; set >= 1; set -=1) {
4982  int base = set * num_parts;
4983  for(int part = 0; part < num_parts; ++part) {
4984  point_counter(base + part) = point_counter(base + part - num_parts);
4985  }
4986  }
4987 
4988  for(int part = 0; part < num_parts; ++part) {
4989  point_counter(part) = 0;
4990  }
4991 
4992  for(int set = 1; set < num_sets; ++set) {
4993  int base = set * num_parts;
4994  for(int part = 0; part < num_parts; ++part) {
4995  point_counter(base + part) += point_counter(base + part - num_parts);
4996  }
4997  }
4998  });
4999 
5000  // now permute
5001  Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(member_type team_member) {
5002  int team = team_member.league_rank();
5003  int team_offset = team * num_threads * num_parts;
5004  mj_lno_t begin = coordinate_begin_index + team * block_size;
5005  mj_lno_t end = begin + block_size;
5006  if(end > coordinate_end_index) {
5007  end = coordinate_end_index;
5008  }
5009  Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
5010  [=] (mj_lno_t ii) {
5011  int thread = team_member.team_rank();
5012  mj_lno_t i = local_coordinate_permutations(ii);
5013  mj_part_t p = local_assigned_part_ids(i);
5014  int index = team_offset + thread * num_parts + p;
5015  int set_counter = (point_counter(index)++) + local_point_counts(p);
5016  local_new_coordinate_permutations(coordinate_begin_index + set_counter) = i;
5017  });
5018  });
5019 #endif
5020 }
5021 
5065 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5066  typename mj_part_t, typename mj_node_t>
5067 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
5068  mj_node_t>::mj_get_new_cut_coordinates(
5069  mj_part_t current_concurrent_num_parts,
5070  mj_part_t kk,
5071  const mj_part_t &num_cuts,
5072  const double &used_imbalance_tolerance,
5073  Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
5074  Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
5075  Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
5076  Kokkos::View<bool *, device_t> & current_cut_line_determined,
5077  Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
5078  Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
5079  Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
5080  Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
5081  Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
5082  Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
5083  Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
5084  Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
5085  Kokkos::View<mj_scalar_t *, device_t> &
5086  current_part_cut_line_weight_to_put_left,
5087  Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count)
5088 {
5089  Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);
5090 
5091  auto local_device_incomplete_cut_count = device_incomplete_cut_count;
5092  auto local_sEpsilon = sEpsilon;
5093  auto local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
5094  auto local_global_rectilinear_cut_weight = global_rectilinear_cut_weight;
5095  auto local_process_rectilinear_cut_weight = process_rectilinear_cut_weight;
5096  auto local_global_min_max_coord_total_weight =
5097  global_min_max_coord_total_weight;
5098 
5099  const auto _sEpsilon = this->sEpsilon;
5100  // Note for a 22 part system I tried removing the outer loop
5101  // and doing each sub loop as a simple parallel_for over num_cuts.
5102  // But that was about twice as slow (10ms) as the current form (5ms)
5103  // so I think the overhead of launching the new global parallel kernels
5104  // is costly. This form is just running one team so effectively using
5105  // a single warp to process the cuts. I expect with a lot of parts this
5106  // might need changing.
5107  Kokkos::TeamPolicy<typename mj_node_t::execution_space>
5108  policy_one_team(1, Kokkos::AUTO());
5109  typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
5111  Kokkos::parallel_for(policy_one_team, KOKKOS_LAMBDA(member_type team_member) {
5112 
5113  mj_scalar_t min_coordinate =
5114  local_global_min_max_coord_total_weight(kk);
5115  mj_scalar_t max_coordinate =
5116  local_global_min_max_coord_total_weight(
5117  kk + current_concurrent_num_parts);
5118  mj_scalar_t global_total_weight =
5119  local_global_min_max_coord_total_weight(
5120  kk + current_concurrent_num_parts * 2);
5121 
5122  Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, num_cuts),
5123  [=] (mj_part_t i) {
5124  // if left and right closest points are not set yet,
5125  // set it to the cut itself.
5126  if(min_coordinate -
5127  current_global_left_closest_points(i) > local_sEpsilon) {
5128  current_global_left_closest_points(i) =
5129  current_cut_coordinates(i);
5130  }
5131  if(current_global_right_closest_points(i) -
5132  max_coordinate > local_sEpsilon) {
5133  current_global_right_closest_points(i) =
5134  current_cut_coordinates(i);
5135  }
5136  });
5137  team_member.team_barrier(); // for end of Kokkos::TeamThreadRange
5138 
5139  Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, num_cuts),
5140  [=] (mj_part_t i) {
5141  using algMJ_t = AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
5142  mj_node_t>;
5143  // seen weight in the part
5144  mj_scalar_t seen_weight_in_part = 0;
5145  // expected weight for part.
5146  mj_scalar_t expected_weight_in_part = 0;
5147  // imbalance for the left and right side of the cut.
5148  double imbalance_on_left = 0, imbalance_on_right = 0;
5149  if(local_distribute_points_on_cut_lines) {
5150  // init the weight on the cut.
5151  local_global_rectilinear_cut_weight(i) = 0;
5152  local_process_rectilinear_cut_weight(i) = 0;
5153  }
5154  bool bContinue = false;
5155  // if already determined at previous iterations,
5156  // then just write the coordinate to new array, and proceed.
5157  if(current_cut_line_determined(i)) {
5158  new_current_cut_coordinates(i) =
5159  current_cut_coordinates(i);
5160  bContinue = true;
5161  }
5162  if(!bContinue) {
5163  //current weight of the part at the left of the cut line.
5164  seen_weight_in_part = current_global_part_weights(i * 2);
5165 
5166  //expected ratio
5167  expected_weight_in_part = current_part_target_weights(i);
5168 
5169  //leftImbalance = imbalanceOf(seenW, globalTotalWeight, expected);
5170  imbalance_on_left = algMJ_t::calculate_imbalance(seen_weight_in_part,
5171  expected_weight_in_part);
5172  // rightImbalance = imbalanceOf(globalTotalWeight - seenW,
5173  // globalTotalWeight, 1 - expected);
5174  imbalance_on_right = algMJ_t::calculate_imbalance(global_total_weight -
5175  seen_weight_in_part, global_total_weight - expected_weight_in_part);
5176  bool is_left_imbalance_valid = std::abs(imbalance_on_left) -
5177  used_imbalance_tolerance < local_sEpsilon ;
5178  bool is_right_imbalance_valid = std::abs(imbalance_on_right) -
5179  used_imbalance_tolerance < local_sEpsilon;
5180  //if the cut line reaches to desired imbalance.
5181  if(is_left_imbalance_valid && is_right_imbalance_valid) {
5182  current_cut_line_determined(i) = true;
5183  Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5184  new_current_cut_coordinates(i) = current_cut_coordinates(i);
5185  }
5186  else if(imbalance_on_left < 0) {
5187  //if left imbalance < 0 then we need to move the cut to right.
5188  if(local_distribute_points_on_cut_lines) {
5189  // if it is okay to distribute the coordinate on
5190  // the same coordinate to left and right.
5191  // then check if we can reach to the target weight by including the
5192  // coordinates in the part.
5193  if(current_global_part_weights(i * 2 + 1) ==
5194  expected_weight_in_part) {
5195  // if it is we are done.
5196  current_cut_line_determined(i) = true;
5197  Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5198 
5199  //then assign everything on the cut to the left of the cut.
5200  new_current_cut_coordinates(i) =
5201  current_cut_coordinates(i);
5202  //for this cut all the weight on cut will be put to left.
5203  current_part_cut_line_weight_to_put_left(i) =
5204  current_local_part_weights(i * 2 + 1) -
5205  current_local_part_weights(i * 2);
5206  bContinue = true;
5207  }
5208  else if(current_global_part_weights(i * 2 + 1) >
5209  expected_weight_in_part) {
5210  // if the weight is larger than the expected weight,
5211  // then we need to distribute some points to left, some to right.
5212  current_cut_line_determined(i) = true;
5213  Kokkos::atomic_add(&view_rectilinear_cut_count(0), 1);
5214 
5215  // increase the num cuts to be determined with rectilinear
5216  // partitioning.
5217  Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5218  new_current_cut_coordinates(i) =
5219  current_cut_coordinates(i);
5220  local_process_rectilinear_cut_weight[i] =
5221  current_local_part_weights(i * 2 + 1) -
5222  current_local_part_weights(i * 2);
5223  bContinue = true;
5224  }
5225  }
5226 
5227  if(!bContinue) {
5228 
5229  // we need to move further right,so set lower bound to current line,
5230  // and shift it to the closes point from right.
5231  current_cut_lower_bounds(i) =
5232  current_global_right_closest_points(i);
5233 
5234  //set the lower bound weight to the weight we have seen.
5235  current_cut_lower_bound_weights(i) = seen_weight_in_part;
5236 
5237  // compare the upper bound with what has been found in the
5238  // last iteration.
5239  // we try to make more strict bounds for the cut here.
5240  for(mj_part_t ii = i + 1; ii < num_cuts ; ++ii) {
5241  mj_scalar_t p_weight = current_global_part_weights(ii * 2);
5242  mj_scalar_t line_weight =
5243  current_global_part_weights(ii * 2 + 1);
5244  if(p_weight >= expected_weight_in_part) {
5245  // if a cut on the right has the expected weight, then we found
5246  // our cut position. Set up and low coordiantes to this
5247  // new cut coordinate, but we need one more iteration to
5248  // finalize the cut position, as wee need to update the part ids.
5249  if(p_weight == expected_weight_in_part) {
5250  current_cut_upper_bounds(i) =
5251  current_cut_coordinates(ii);
5252  current_cut_upper_weights(i) = p_weight;
5253  current_cut_lower_bounds(i) =
5254  current_cut_coordinates(ii);
5255  current_cut_lower_bound_weights(i) = p_weight;
5256  } else if(p_weight < current_cut_upper_weights(i)) {
5257  // if a part weight is larger then my expected weight,
5258  // but lower than my upper bound weight, update upper bound.
5259  current_cut_upper_bounds(i) =
5260  current_global_left_closest_points(ii);
5261  current_cut_upper_weights(i) = p_weight;
5262  }
5263  break;
5264  }
5265  // if comes here then pw < ew
5266  // then compare the weight against line weight.
5267  if(line_weight >= expected_weight_in_part) {
5268  // if the line is larger than the expected weight, then we need
5269  // to reach to the balance by distributing coordinates on
5270  // this line.
5271  current_cut_upper_bounds(i) =
5272  current_cut_coordinates(ii);
5273  current_cut_upper_weights(i) = line_weight;
5274  current_cut_lower_bounds(i) =
5275  current_cut_coordinates(ii);
5276  current_cut_lower_bound_weights(i) = p_weight;
5277  break;
5278  }
5279  // if a stricter lower bound is found,
5280  // update the lower bound.
5281  if(p_weight <= expected_weight_in_part && p_weight >=
5282  current_cut_lower_bound_weights(i)) {
5283  current_cut_lower_bounds(i) =
5284  current_global_right_closest_points(ii);
5285  current_cut_lower_bound_weights(i) = p_weight;
5286  }
5287  }
5288 
5289  mj_scalar_t new_cut_position = 0;
5290  algMJ_t::mj_calculate_new_cut_position(
5291  current_cut_upper_bounds(i),
5292  current_cut_lower_bounds(i),
5293  current_cut_upper_weights(i),
5294  current_cut_lower_bound_weights(i),
5295  expected_weight_in_part, new_cut_position,
5296  _sEpsilon);
5297 
5298  // if cut line does not move significantly.
5299  // then finalize the search.
5300  if(std::abs(current_cut_coordinates(i) -
5301  new_cut_position) < local_sEpsilon) {
5302  current_cut_line_determined(i) = true;
5303  Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5304 
5305  //set the cut coordinate and proceed.
5306  new_current_cut_coordinates(i) =
5307  current_cut_coordinates(i);
5308  } else {
5309  new_current_cut_coordinates(i) = new_cut_position;
5310  }
5311  } // bContinue
5312  } else {
5313  // need to move the cut line to left.
5314  // set upper bound to current line.
5315  current_cut_upper_bounds(i) =
5316  current_global_left_closest_points(i);
5317  current_cut_upper_weights(i) =
5318  seen_weight_in_part;
5319  // compare the current cut line weights with
5320  // previous upper and lower bounds.
5321  for(int ii = i - 1; ii >= 0; --ii) {
5322  mj_scalar_t p_weight =
5323  current_global_part_weights(ii * 2);
5324  mj_scalar_t line_weight =
5325  current_global_part_weights(ii * 2 + 1);
5326  if(p_weight <= expected_weight_in_part) {
5327  if(p_weight == expected_weight_in_part) {
5328  // if the weight of the part is my expected weight
5329  // then we find the solution.
5330  current_cut_upper_bounds(i) =
5331  current_cut_coordinates(ii);
5332  current_cut_upper_weights(i) = p_weight;
5333  current_cut_lower_bounds(i) =
5334  current_cut_coordinates(ii);
5335  current_cut_lower_bound_weights(i) = p_weight;
5336  }
5337  else if(p_weight > current_cut_lower_bound_weights(i)) {
5338  // if found weight is bigger than the lower bound
5339  // then update the lower bound.
5340  current_cut_lower_bounds(i) =
5341  current_global_right_closest_points(ii);
5342  current_cut_lower_bound_weights(i) = p_weight;
5343 
5344  // at the same time, if weight of line is bigger than the
5345  // expected weight, then update the upper bound as well.
5346  // in this case the balance will be obtained by distributing
5347  // weights on this cut position.
5348  if(line_weight > expected_weight_in_part) {
5349  current_cut_upper_bounds(i) =
5350  current_global_right_closest_points(ii);
5351  current_cut_upper_weights(i) = line_weight;
5352  }
5353  }
5354  break;
5355  }
5356  // if the weight of the cut on the left is still bigger than
5357  // my weight, and also if the weight is smaller than the current
5358  // upper weight, or if the weight is equal to current upper
5359  // weight, but on the left of the upper weight, then update
5360  // upper bound.
5361  if(p_weight >= expected_weight_in_part &&
5362  (p_weight < current_cut_upper_weights(i) ||
5363  (p_weight == current_cut_upper_weights(i) &&
5364  current_cut_upper_bounds(i) >
5365  current_global_left_closest_points(ii)))) {
5366  current_cut_upper_bounds(i) =
5367  current_global_left_closest_points(ii);
5368  current_cut_upper_weights(i) = p_weight;
5369  }
5370  }
5371  mj_scalar_t new_cut_position = 0;
5372  algMJ_t::mj_calculate_new_cut_position(
5373  current_cut_upper_bounds(i),
5374  current_cut_lower_bounds(i),
5375  current_cut_upper_weights(i),
5376  current_cut_lower_bound_weights(i),
5377  expected_weight_in_part,
5378  new_cut_position,
5379  _sEpsilon);
5380 
5381  // if cut line does not move significantly.
5382  if(std::abs(current_cut_coordinates(i) -
5383  new_cut_position) < local_sEpsilon) {
5384  current_cut_line_determined(i) = true;
5385  Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5386  //set the cut coordinate and proceed.
5387  new_current_cut_coordinates(i) =
5388  current_cut_coordinates(i);
5389  } else {
5390  new_current_cut_coordinates(i) =
5391  new_cut_position;
5392  }
5393  }
5394  }; // bContinue
5395  });
5396 
5397  team_member.team_barrier(); // for end of Kokkos::TeamThreadRange
5398  });
5399 
5400  // view_rectilinear_cut_count
5401  mj_part_t rectilinear_cut_count;
5402  Kokkos::parallel_reduce("Read bDoingWork",
5403  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
5404  KOKKOS_LAMBDA(int dummy, int & set_single) {
5405  set_single = view_rectilinear_cut_count(0);
5406  }, rectilinear_cut_count);
5407 
5408  if(rectilinear_cut_count > 0) {
5409  auto host_local_process_rectilinear_cut_weight =
5410  Kokkos::create_mirror_view(Kokkos::HostSpace(),
5411  local_process_rectilinear_cut_weight);
5412  auto host_local_global_rectilinear_cut_weight =
5413  Kokkos::create_mirror_view(Kokkos::HostSpace(),
5414  local_global_rectilinear_cut_weight);
5415  Kokkos::deep_copy(host_local_process_rectilinear_cut_weight,
5416  local_process_rectilinear_cut_weight);
5417  Kokkos::deep_copy(host_local_global_rectilinear_cut_weight,
5418  local_global_rectilinear_cut_weight);
5419  Teuchos::scan<int,mj_scalar_t>(
5420  *comm, Teuchos::REDUCE_SUM,
5421  num_cuts,
5422  host_local_process_rectilinear_cut_weight.data(),
5423  host_local_global_rectilinear_cut_weight.data());
5424  Kokkos::deep_copy(local_process_rectilinear_cut_weight,
5425  host_local_process_rectilinear_cut_weight);
5426  Kokkos::deep_copy(local_global_rectilinear_cut_weight,
5427  host_local_global_rectilinear_cut_weight);
5428 
5429  Kokkos::parallel_for("finish up mj_get_new_cut_coordinates",
5430  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
5431  KOKKOS_LAMBDA(int dummy) {
5432  for(mj_part_t i = 0; i < num_cuts; ++i) {
5433  // if cut line weight to be distributed.
5434  if(local_global_rectilinear_cut_weight(i) > 0) {
5435  // expected weight to go to left of the cut.
5436  mj_scalar_t expected_part_weight = current_part_target_weights(i);
5437  // the weight that should be put to left of the cut.
5438  mj_scalar_t necessary_weight_on_line_for_left =
5439  expected_part_weight - current_global_part_weights(i * 2);
5440 
5441  // the weight of the cut in the process
5442  mj_scalar_t my_weight_on_line =
5443  local_process_rectilinear_cut_weight(i);
5444 
5445  // the sum of the cut weights upto this process,
5446  // including the weight of this process.
5447  mj_scalar_t weight_on_line_upto_process_inclusive =
5448  local_global_rectilinear_cut_weight(i);
5449  // the space on the left side of the cut after all processes
5450  // before this process (including this process)
5451  // puts their weights on cut to left.
5452  mj_scalar_t space_to_put_left =
5453  necessary_weight_on_line_for_left -
5454  weight_on_line_upto_process_inclusive;
5455  // add my weight to this space to find out how much space
5456  // is left to me.
5457  mj_scalar_t space_left_to_me =
5458  space_to_put_left + my_weight_on_line;
5459 
5460  /*
5461  cout << "expected_part_weight:" << expected_part_weight
5462  << " necessary_weight_on_line_for_left:"
5463  << necessary_weight_on_line_for_left
5464  << " my_weight_on_line" << my_weight_on_line
5465  << " weight_on_line_upto_process_inclusive:"
5466  << weight_on_line_upto_process_inclusive
5467  << " space_to_put_left:" << space_to_put_left
5468  << " space_left_to_me" << space_left_to_me << endl;
5469  */
5470 
5471  if(space_left_to_me < 0) {
5472  // space_left_to_me is negative and i dont need to put
5473  // anything to left.
5474  current_part_cut_line_weight_to_put_left(i) = 0;
5475  }
5476  else if(space_left_to_me >= my_weight_on_line) {
5477  // space left to me is bigger than the weight of the
5478  // processor on cut.
5479  // so put everything to left.
5480  current_part_cut_line_weight_to_put_left(i) =
5481  my_weight_on_line;
5482  // cout << "setting current_part_cut_line_weight_to_put_left
5483  // to my_weight_on_line:" << my_weight_on_line << endl;
5484  }
5485  else {
5486  // put only the weight as much as the space.
5487  current_part_cut_line_weight_to_put_left(i) =
5488  space_left_to_me;
5489  // cout << "setting current_part_cut_line_weight_to_put_left
5490  // to space_left_to_me:" << space_left_to_me << endl;
5491  }
5492  }
5493  }
5494  view_rectilinear_cut_count(0) = 0;
5495  });
5496  }
5497 
5498  Kokkos::deep_copy(this->incomplete_cut_count, device_incomplete_cut_count);
5499 }
5500 
5510 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5511  typename mj_part_t, typename mj_node_t>
5512 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5513  get_processor_num_points_in_parts(
5514  mj_part_t num_procs,
5515  mj_part_t num_parts,
5516  mj_gno_t *&num_points_in_all_processor_parts)
5517 {
5518  // initially allocation_size is num_parts
5519  size_t allocation_size = num_parts * (num_procs + 1);
5520 
5521  // this will be output
5522  // holds how many each processor has in each part.
5523  // last portion is the sum of all processor points in each part.
5524 
5525  // allocate memory for the local num coordinates in each part.
5526  mj_gno_t *num_local_points_in_each_part_to_reduce_sum =
5527  new mj_gno_t[allocation_size];
5528 
5529  // this is the portion of the memory which will be used
5530  // at the summation to obtain total number of processors' points in each part.
5531  mj_gno_t *my_local_points_to_reduce_sum =
5532  num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
5533 
5534  // this is the portion of the memory where each stores its local number.
5535  // this information is needed by other processors.
5536  mj_gno_t *my_local_point_counts_in_each_part =
5537  num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
5538 
5539  // initialize the array with 0's.
5540  memset(num_local_points_in_each_part_to_reduce_sum, 0,
5541  sizeof(mj_gno_t)*allocation_size);
5542 
5543  auto local_new_part_xadj = this->new_part_xadj;
5544  Kokkos::View<mj_gno_t *, typename mj_node_t::device_type> points_per_part(
5545  Kokkos::ViewAllocateWithoutInitializing("points per part"), num_parts);
5546  Kokkos::parallel_for("get vals on device",
5547  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_gno_t>
5548  (0, num_parts), KOKKOS_LAMBDA(mj_gno_t i) {
5549  points_per_part(i) =
5550  local_new_part_xadj(i) - ((i == 0) ? 0 : local_new_part_xadj(i-1));
5551  });
5552  auto host_points_per_part = Kokkos::create_mirror_view(points_per_part);
5553  Kokkos::deep_copy(host_points_per_part, points_per_part);
5554  for(int i = 0; i < num_parts; ++i) {
5555  my_local_points_to_reduce_sum[i] = host_points_per_part(i);
5556  }
5557 
5558  // copy the local num parts to the last portion of array, so that this portion
5559  // will represent the global num points in each part after the reduction.
5560  memcpy (my_local_point_counts_in_each_part, my_local_points_to_reduce_sum,
5561  sizeof(mj_gno_t) * (num_parts) );
5562 
5563  // reduceAll operation.
5564  // the portion that belongs to a processor with index p
5565  // will start from myRank * num_parts.
5566  // the global number of points will be held at the index
5567  try{
5568  reduceAll<int, mj_gno_t>(
5569  *(this->comm),
5570  Teuchos::REDUCE_SUM,
5571  allocation_size,
5572  num_local_points_in_each_part_to_reduce_sum,
5573  num_points_in_all_processor_parts);
5574  }
5575  Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
5576 
5577  delete [] num_local_points_in_each_part_to_reduce_sum;
5578 }
5579 
5595 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5596  typename mj_part_t, typename mj_node_t>
5597 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5598  mj_check_to_migrate(
5599  size_t migration_reduce_all_population,
5600  mj_lno_t num_coords_for_last_dim_part,
5601  mj_part_t num_procs,
5602  mj_part_t num_parts,
5603  mj_gno_t *num_points_in_all_processor_parts)
5604 {
5605  // if reduce all count and population in the last dim is too high
5606  if(migration_reduce_all_population > future_reduceall_cutoff) {
5607  return true;
5608  }
5609 
5610  // if the work in a part per processor in the last dim is too low.
5611  if(num_coords_for_last_dim_part < min_work_last_dim) {
5612  return true;
5613  }
5614 
5615  // if migration is to be checked and the imbalance is too high
5616  if(this->check_migrate_avoid_migration_option == 0) {
5617  double global_imbalance = 0;
5618  // global shift to reach the sum of coordiante count in each part.
5619  size_t global_shift = num_procs * num_parts;
5620 
5621  for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5622  for(mj_part_t i = 0; i < num_parts; ++i) {
5623  double ideal_num = num_points_in_all_processor_parts[global_shift + i]
5624  / double(num_procs);
5625 
5626  global_imbalance += std::abs(ideal_num -
5627  num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num);
5628  }
5629  }
5630  global_imbalance /= num_parts;
5631  global_imbalance /= num_procs;
5632 
5633  if(global_imbalance <= this->minimum_migration_imbalance) {
5634  return false;
5635  }
5636  else {
5637  return true;
5638  }
5639  }
5640  else {
5641  // if migration is forced
5642  return true;
5643  }
5644 }
5645 
5659 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5660  typename mj_part_t, typename mj_node_t>
5661 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5662  assign_send_destinations(
5663  mj_part_t num_parts,
5664  mj_part_t *part_assignment_proc_begin_indices,
5665  mj_part_t *processor_chains_in_parts,
5666  mj_lno_t *send_count_to_each_proc,
5667  int *coordinate_destinations) {
5668 
5669  auto host_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
5670  deep_copy(host_new_part_xadj, this->new_part_xadj);
5671 
5672  auto host_new_coordinate_permutations =
5673  Kokkos::create_mirror_view(this->new_coordinate_permutations);
5674  deep_copy(host_new_coordinate_permutations, this->new_coordinate_permutations);
5675 
5676  for(mj_part_t p = 0; p < num_parts; ++p) {
5677  mj_lno_t part_begin = 0;
5678  if(p > 0) part_begin = host_new_part_xadj(p - 1);
5679  mj_lno_t part_end = host_new_part_xadj(p);
5680  // get the first part that current processor will send its part-p.
5681  mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
5682  // initialize how many point I sent to this processor.
5683  mj_lno_t num_total_send = 0;
5684  for(mj_lno_t j=part_begin; j < part_end; j++) {
5685  mj_lno_t local_ind = host_new_coordinate_permutations(j);
5686  while (num_total_send >= send_count_to_each_proc[proc_to_sent]) {
5687  // then get the next processor to send the points in part p.
5688  num_total_send = 0;
5689  // assign new processor to part_assign_begin[p]
5690  part_assignment_proc_begin_indices[p] =
5691  processor_chains_in_parts[proc_to_sent];
5692  // remove the previous processor
5693  processor_chains_in_parts[proc_to_sent] = -1;
5694  // choose the next processor as the next one to send.
5695  proc_to_sent = part_assignment_proc_begin_indices[p];
5696  }
5697  // write the gno index to corresponding position in sendBuf.
5698  coordinate_destinations[local_ind] = proc_to_sent;
5699  ++num_total_send;
5700  }
5701  }
5702 }
5703 
5724 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5725  typename mj_part_t, typename mj_node_t>
5726 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5727  mj_assign_proc_to_parts(
5728  mj_gno_t * num_points_in_all_processor_parts,
5729  mj_part_t num_parts,
5730  mj_part_t num_procs,
5731  mj_lno_t *send_count_to_each_proc,
5732  std::vector<mj_part_t> &processor_ranks_for_subcomm,
5733  std::vector<mj_part_t> *next_future_num_parts_in_parts,
5734  mj_part_t &out_part_index,
5735  mj_part_t &output_part_numbering_begin_index,
5736  int * coordinate_destinations) {
5737  mj_gno_t *global_num_points_in_parts =
5738  num_points_in_all_processor_parts + num_procs * num_parts;
5739  mj_part_t *num_procs_assigned_to_each_part = new mj_part_t[num_parts];
5740 
5741  // boolean variable if the process finds its part to be assigned.
5742  bool did_i_find_my_group = false;
5743 
5744  mj_part_t num_free_procs = num_procs;
5745  mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;
5746 
5747  double max_imbalance_difference = 0;
5748  mj_part_t max_differing_part = 0;
5749 
5750  // find how many processor each part requires.
5751  for(mj_part_t i = 0; i < num_parts; i++) {
5752 
5753  // scalar portion of the required processors
5754  double scalar_required_proc = num_procs *
5755  (double (global_num_points_in_parts[i]) /
5756  double (this->num_global_coords));
5757 
5758  // round it to closest integer; make sure have at least one proc.
5759  mj_part_t required_proc =
5760  static_cast<mj_part_t> (0.5 + scalar_required_proc);
5761  if(required_proc == 0) required_proc = 1;
5762 
5763  // if assigning the required num procs, creates problems for the rest
5764  // of the parts, then only assign {num_free_procs -
5765  // (minimum_num_procs_required_for_rest_of_parts)} procs to this part.
5766  if(num_free_procs -
5767  required_proc < minimum_num_procs_required_for_rest_of_parts) {
5768  required_proc = num_free_procs -
5769  (minimum_num_procs_required_for_rest_of_parts);
5770  }
5771 
5772  // reduce the free processor count
5773  num_free_procs -= required_proc;
5774 
5775  // reduce the free minimum processor count required for the rest of the
5776  // part by 1.
5777  --minimum_num_procs_required_for_rest_of_parts;
5778 
5779  // part (i) is assigned to (required_proc) processors.
5780  num_procs_assigned_to_each_part[i] = required_proc;
5781 
5782  // because of the roundings some processors might be left as unassigned.
5783  // we want to assign those processors to the part with most imbalance.
5784  // find the part with the maximum imbalance here.
5785  double imbalance_wrt_ideal =
5786  (scalar_required_proc - required_proc) / required_proc;
5787  if(imbalance_wrt_ideal > max_imbalance_difference) {
5788  max_imbalance_difference = imbalance_wrt_ideal;
5789  max_differing_part = i;
5790  }
5791  }
5792 
5793  // assign extra processors to the part with maximum imbalance
5794  // than the ideal.
5795  if(num_free_procs > 0) {
5796  num_procs_assigned_to_each_part[max_differing_part] += num_free_procs;
5797  }
5798 
5799  // now find what are the best processors with least migration for each part.
5800 
5801  // part_assignment_proc_begin_indices ([i]) is the array that holds the
5802  // beginning index of a processor that processor sends its data for part - i
5803  mj_part_t *part_assignment_proc_begin_indices = new mj_part_t[num_parts];
5804 
5805  // the next processor send is found in processor_chains_in_parts,
5806  // in linked list manner.
5807  mj_part_t *processor_chains_in_parts = new mj_part_t [num_procs];
5808  mj_part_t *processor_part_assignments = new mj_part_t[num_procs];
5809 
5810  // initialize the assignment of each processor.
5811  // this has a linked list implementation.
5812  // the beginning of processors assigned
5813  // to each part is hold at part_assignment_proc_begin_indices[part].
5814  // then the next processor assigned to that part is located at
5815  // proc_part_assignments[part_assign_begins[part]], this is a chain
5816  // until the value of -1 is reached.
5817  for(int i = 0; i < num_procs; ++i ) {
5818  processor_part_assignments[i] = -1;
5819  processor_chains_in_parts[i] = -1;
5820  }
5821  for(int i = 0; i < num_parts; ++i ) {
5822  part_assignment_proc_begin_indices[i] = -1;
5823  }
5824 
5825  // std::cout << "Before migration: mig type:" <<
5826  // this->migration_type << std::endl;
5827  // Allocate memory for sorting data structure.
5828  uSignedSortItem<mj_part_t, mj_gno_t, char> *
5829  sort_item_num_part_points_in_procs =
5830  new uSignedSortItem<mj_part_t, mj_gno_t, char>[num_procs];
5831 
5832  for(mj_part_t i = 0; i < num_parts; ++i) {
5833  // the algorithm tries to minimize the cost of migration, by assigning the
5834  // processors with highest number of coordinates on that part.
5835  // here we might want to implement a maximum weighted bipartite matching
5836  // algorithm.
5837  for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5838  sort_item_num_part_points_in_procs[ii].id = ii;
5839  // if processor is not assigned yet.
5840  // add its num points to the sort data structure.
5841  if(processor_part_assignments[ii] == -1) {
5842  sort_item_num_part_points_in_procs[ii].val =
5843  num_points_in_all_processor_parts[ii * num_parts + i];
5844  // indicate that the processor has positive weight.
5845  sort_item_num_part_points_in_procs[ii].signbit = 1;
5846  }
5847  else {
5848  // if processor is already assigned, insert -nLocal - 1 so that it
5849  // won't be selected again.
5850  // would be same if we simply set it to -1, but more information with
5851  // no extra cost (which is used later) is provided.
5852  // sort_item_num_part_points_in_procs[ii].val =
5853  // -num_points_in_all_processor_parts[ii * num_parts + i] - 1;
5854 
5855  // UPDATE: Since above gets warning when unsigned is used to
5856  // represent, we added extra bit to as sign bit to the sort item.
5857  // It is 1 for positives, 0 for negatives.
5858  sort_item_num_part_points_in_procs[ii].val =
5859  num_points_in_all_processor_parts[ii * num_parts + i];
5860  sort_item_num_part_points_in_procs[ii].signbit = 0;
5861  }
5862  }
5863 
5864  // sort the processors in the part.
5865  uqSignsort<mj_part_t, mj_gno_t,char>
5866  (num_procs, sort_item_num_part_points_in_procs);
5867 
5868  /*
5869  for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5870  std::cout << "ii:" << ii << " " <<
5871  sort_item_num_part_points_in_procs[ii].id <<
5872  " " << sort_item_num_part_points_in_procs[ii].val <<
5873  " " << int(sort_item_num_part_points_in_procs[ii].signbit) <<
5874  std::endl;
5875  }
5876  */
5877 
5878  mj_part_t required_proc_count = num_procs_assigned_to_each_part[i];
5879  mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
5880  mj_gno_t ideal_num_points_in_a_proc = Teuchos::as<mj_gno_t>(
5881  ceil(total_num_points_in_part / double (required_proc_count)));
5882 
5883  // starts sending to least heaviest part.
5884  mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
5885  mj_part_t next_proc_to_send_id =
5886  sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
5887  mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc -
5888  sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
5889 
5890  // find the processors that will be assigned to this part, which are the
5891  // heaviest non assigned processors.
5892  for(mj_part_t ii = num_procs - 1;
5893  ii >= num_procs - required_proc_count; --ii) {
5894  mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].id;
5895  // assign processor to part - i.
5896  processor_part_assignments[proc_id] = i;
5897  }
5898 
5899  bool did_change_sign = false;
5900  // if processor has a minus count, reverse it.
5901  for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5902  // TODO: THE LINE BELOW PRODUCES A WARNING IF gno_t IS UNSIGNED
5903  // TODO: SEE BUG 6194
5904  if(sort_item_num_part_points_in_procs[ii].signbit == 0) {
5905  did_change_sign = true;
5906  sort_item_num_part_points_in_procs[ii].signbit = 1;
5907  }
5908  else {
5909  break;
5910  }
5911  }
5912 
5913  if(did_change_sign) {
5914  // resort the processors in the part for the rest of the processors that
5915  // is not assigned.
5916  uqSignsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count,
5917  sort_item_num_part_points_in_procs);
5918  }
5919 
5920  /*
5921  for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5922  std::cout << "after resort ii:" << ii << " " <<
5923  sort_item_num_part_points_in_procs[ii].id <<
5924  " " << sort_item_num_part_points_in_procs[ii].val <<
5925  " " << int(sort_item_num_part_points_in_procs[ii].signbit ) <<
5926  std::endl;
5927  }
5928  */
5929 
5930  // check if this processors is one of the procs assigned to this part.
5931  // if it is, then get the group.
5932  if(!did_i_find_my_group) {
5933  for(mj_part_t ii = num_procs - 1; ii >=
5934  num_procs - required_proc_count; --ii) {
5935 
5936  mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].id;
5937 
5938  // add the proc to the group.
5939  processor_ranks_for_subcomm.push_back(proc_id_to_assign);
5940 
5941  if(proc_id_to_assign == this->myRank) {
5942  // if the assigned process is me, then I find my group.
5943  did_i_find_my_group = true;
5944 
5945  // set the beginning of part i to my rank.
5946  part_assignment_proc_begin_indices[i] = this->myRank;
5947  processor_chains_in_parts[this->myRank] = -1;
5948 
5949  // set send count to myself to the number of points that I have
5950  // in part i.
5951  send_count_to_each_proc[this->myRank] =
5952  sort_item_num_part_points_in_procs[ii].val;
5953 
5954  // calculate the shift required for the
5955  // output_part_numbering_begin_index
5956  for(mj_part_t in = 0; in < i; ++in) {
5957  output_part_numbering_begin_index +=
5958  (*next_future_num_parts_in_parts)[in];
5959  }
5960  out_part_index = i;
5961  }
5962  }
5963 
5964  // if these was not my group,
5965  // clear the subcomminicator processor array.
5966  if(!did_i_find_my_group) {
5967  processor_ranks_for_subcomm.clear();
5968  }
5969  }
5970 
5971  // send points of the nonassigned coordinates to the assigned coordinates.
5972  // starts from the heaviest nonassigned processor.
5973  // TODO we might want to play with this part, that allows more
5974  // computational imbalance but having better communication balance.
5975  for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii) {
5976  mj_part_t nonassigned_proc_id =
5977  sort_item_num_part_points_in_procs[ii].id;
5978  mj_lno_t num_points_to_sent =
5979  sort_item_num_part_points_in_procs[ii].val;
5980 
5981  // we set number of points to -to_sent - 1 for the assigned processors.
5982  // we reverse it here. This should not happen, as we have already
5983  // reversed them above.
5984 #ifdef MJ_DEBUG
5985  if(num_points_to_sent < 0) {
5986  cout << "Migration - processor assignments - for part:" << i
5987  << "from proc:" << nonassigned_proc_id << " num_points_to_sent:"
5988  << num_points_to_sent << std::endl;
5989  std::terminate();
5990  }
5991 #endif
5992 
5993  switch (migration_type) {
5994  case 0:
5995  {
5996  // now sends the points to the assigned processors.
5997  while (num_points_to_sent > 0) {
5998  // if the processor has enough space.
5999  if(num_points_to_sent <= space_left_in_sent_proc) {
6000  // reduce the space left in the processor.
6001  space_left_in_sent_proc -= num_points_to_sent;
6002  // if my rank is the one that is sending the coordinates.
6003  if(this->myRank == nonassigned_proc_id) {
6004  // set my sent count to the sent processor.
6005  send_count_to_each_proc[next_proc_to_send_id] =
6006  num_points_to_sent;
6007  // save the processor in the list (processor_chains_in_parts
6008  // and part_assignment_proc_begin_indices)
6009  // that the processor will send its point in part-i.
6010  mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6011  part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6012  processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6013  }
6014  num_points_to_sent = 0;
6015  }
6016  else {
6017  // there might be no space left in the processor.
6018  if(space_left_in_sent_proc > 0) {
6019  num_points_to_sent -= space_left_in_sent_proc;
6020 
6021  //send as the space left in the processor.
6022  if(this->myRank == nonassigned_proc_id) {
6023  // send as much as the space in this case.
6024  send_count_to_each_proc[next_proc_to_send_id] =
6025  space_left_in_sent_proc;
6026  mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6027  part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6028  processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6029  }
6030  }
6031  // change the sent part
6032  ++next_proc_to_send_index;
6033 
6034 #ifdef MJ_DEBUG
6035  if(next_part_to_send_index < nprocs - required_proc_count ) {
6036  cout << "Migration - processor assignments - for part:"
6037  << i
6038  << " next_part_to_send :" << next_part_to_send_index
6039  << " nprocs:" << nprocs
6040  << " required_proc_count:" << required_proc_count
6041  << " Error: next_part_to_send_index <" <<
6042  << " nprocs - required_proc_count" << std::endl;
6043  std::terminate();
6044  }
6045 #endif
6046  // send the new id.
6047  next_proc_to_send_id =
6048  sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
6049  // set the new space in the processor.
6050  space_left_in_sent_proc = ideal_num_points_in_a_proc -
6051  sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
6052  }
6053  }
6054  }
6055  break;
6056  default:
6057  {
6058  // to minimize messages, we want each processor to send its
6059  // coordinates to only a single point.
6060  // we do not respect imbalances here, we send all points to the
6061  // next processor.
6062  if(this->myRank == nonassigned_proc_id) {
6063  // set my sent count to the sent processor.
6064  send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
6065  // save the processor in the list (processor_chains_in_parts and
6066  // part_assignment_proc_begin_indices)
6067  // that the processor will send its point in part-i.
6068  mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6069  part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6070  processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6071  }
6072  num_points_to_sent = 0;
6073  ++next_proc_to_send_index;
6074 
6075  // if we made it to the heaviest processor we round robin and
6076  // go to beginning
6077  if(next_proc_to_send_index == num_procs) {
6078  next_proc_to_send_index = num_procs - required_proc_count;
6079  }
6080  // send the new id.
6081  next_proc_to_send_id =
6082  sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
6083  // set the new space in the processor.
6084  space_left_in_sent_proc = ideal_num_points_in_a_proc -
6085  sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
6086  }
6087  }
6088  }
6089  }
6090 
6091  /*
6092  for(int i = 0; i < num_procs;++i) {
6093  std::cout << "me:" << this->myRank << " to part:" << i << " sends:" <<
6094  send_count_to_each_proc[i] << std::endl;
6095  }
6096  */
6097 
6098  this->assign_send_destinations(
6099  num_parts,
6100  part_assignment_proc_begin_indices,
6101  processor_chains_in_parts,
6102  send_count_to_each_proc,
6103  coordinate_destinations);
6104  delete [] part_assignment_proc_begin_indices;
6105  delete [] processor_chains_in_parts;
6106  delete [] processor_part_assignments;
6107  delete [] sort_item_num_part_points_in_procs;
6108  delete [] num_procs_assigned_to_each_part;
6109 }
6110 
6126 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6127  typename mj_part_t, typename mj_node_t>
6128 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6129  assign_send_destinations2(
6130  mj_part_t num_parts,
6131  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
6132  int *coordinate_destinations,
6133  mj_part_t &output_part_numbering_begin_index,
6134  std::vector<mj_part_t> *next_future_num_parts_in_parts)
6135 {
6136  mj_part_t part_shift_amount = output_part_numbering_begin_index;
6137  mj_part_t previous_processor = -1;
6138 
6139  auto local_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
6140  Kokkos::deep_copy(local_new_part_xadj, this->new_part_xadj);
6141 
6142  auto local_new_coordinate_permutations =
6143  Kokkos::create_mirror_view(this->new_coordinate_permutations);
6144  Kokkos::deep_copy(local_new_coordinate_permutations,
6145  this->new_coordinate_permutations);
6146 
6147  for(mj_part_t i = 0; i < num_parts; ++i) {
6148  mj_part_t p = sort_item_part_to_proc_assignment[i].id;
6149 
6150  // assigned processors are sorted.
6151  mj_lno_t part_begin_index = 0;
6152 
6153  if(p > 0) {
6154  part_begin_index = local_new_part_xadj(p - 1);
6155  }
6156 
6157  mj_lno_t part_end_index = local_new_part_xadj(p);
6158 
6159  mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].val;
6160  if(this->myRank == assigned_proc && previous_processor != assigned_proc) {
6161  output_part_numbering_begin_index = part_shift_amount;
6162  }
6163  previous_processor = assigned_proc;
6164  part_shift_amount += (*next_future_num_parts_in_parts)[p];
6165 
6166  for(mj_lno_t j= part_begin_index; j < part_end_index; j++) {
6167  mj_lno_t localInd = local_new_coordinate_permutations(j);
6168  coordinate_destinations[localInd] = assigned_proc;
6169  }
6170  }
6171 }
6172 
6194 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6195  typename mj_part_t, typename mj_node_t>
6196 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6197  mj_assign_parts_to_procs(
6198  mj_gno_t * num_points_in_all_processor_parts,
6199  mj_part_t num_parts,
6200  mj_part_t num_procs,
6201  mj_lno_t *send_count_to_each_proc,
6202  std::vector<mj_part_t> *next_future_num_parts_in_parts,
6203  mj_part_t &out_num_part,
6204  std::vector<mj_part_t> &out_part_indices,
6205  mj_part_t &output_part_numbering_begin_index,
6206  int *coordinate_destinations) {
6207 
6208  out_num_part = 0;
6209  mj_gno_t *global_num_points_in_parts =
6210  num_points_in_all_processor_parts + num_procs * num_parts;
6211  out_part_indices.clear();
6212 
6213  // to sort the parts that is assigned to the processors.
6214  // id is the part number, sort value is the assigned processor id.
6215  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment =
6216  new uSortItem<mj_part_t, mj_part_t>[num_parts];
6217  uSortItem<mj_part_t, mj_gno_t> * sort_item_num_points_of_proc_in_part_i =
6218  new uSortItem<mj_part_t, mj_gno_t>[num_procs];
6219 
6220  // calculate the optimal number of coordinates that should be assigned
6221  // to each processor.
6222  mj_lno_t work_each =
6223  mj_lno_t (this->num_global_coords / (double (num_procs)) + 0.5f);
6224 
6225  // to hold the left space as the number of coordinates to the optimal
6226  // number in each proc.
6227  mj_lno_t *space_in_each_processor = new mj_lno_t[num_procs];
6228 
6229  // initialize left space in each.
6230  for(mj_part_t i = 0; i < num_procs; ++i) {
6231  space_in_each_processor[i] = work_each;
6232  }
6233 
6234  // we keep track of how many parts each processor is assigned to.
6235  // because in some weird inputs, it might be possible that some
6236  // processors is not assigned to any part. Using these variables,
6237  // we force each processor to have at least one part.
6238  mj_part_t *num_parts_proc_assigned = new mj_part_t[num_procs];
6239  memset(num_parts_proc_assigned, 0, sizeof(mj_part_t) * num_procs);
6240  int empty_proc_count = num_procs;
6241 
6242  // to sort the parts with decreasing order of their coordiantes.
6243  // id are the part numbers, sort value is the number of points in each.
6244  uSortItem<mj_part_t, mj_gno_t> * sort_item_point_counts_in_parts =
6245  new uSortItem<mj_part_t, mj_gno_t>[num_parts];
6246 
6247  // initially we will sort the parts according to the number of coordinates
6248  // they have, so that we will start assigning with the part that has the most
6249  // number of coordinates.
6250  for(mj_part_t i = 0; i < num_parts; ++i) {
6251  sort_item_point_counts_in_parts[i].id = i;
6252  sort_item_point_counts_in_parts[i].val = global_num_points_in_parts[i];
6253  }
6254 
6255  // sort parts with increasing order of loads.
6256  uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);
6257 
6258  // assigning parts to the processors
6259  // traverse the part with decreasing order of load.
6260  // first assign the heaviest part.
6261  for(mj_part_t j = 0; j < num_parts; ++j) {
6262  // sorted with increasing order, traverse inverse.
6263  mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].id;
6264 
6265  // load of the part
6266  mj_gno_t load = global_num_points_in_parts[i];
6267 
6268  // assigned processors
6269  mj_part_t assigned_proc = -1;
6270 
6271  // sort processors with increasing number of points in this part.
6272  for(mj_part_t ii = 0; ii < num_procs; ++ii) {
6273  sort_item_num_points_of_proc_in_part_i[ii].id = ii;
6274 
6275  // if there are still enough parts to fill empty processors, than proceed
6276  // normally, but if empty processor count is equal to the number of part,
6277  // then we force to part assignments only to empty processors.
6278  if(empty_proc_count < num_parts - j ||
6279  num_parts_proc_assigned[ii] == 0) {
6280  // how many points processor ii has in part i?
6281  sort_item_num_points_of_proc_in_part_i[ii].val =
6282  num_points_in_all_processor_parts[ii * num_parts + i];
6283  }
6284  else {
6285  sort_item_num_points_of_proc_in_part_i[ii].val = -1;
6286  }
6287  }
6288 
6289  uqsort<mj_part_t, mj_gno_t>(num_procs,
6290  sort_item_num_points_of_proc_in_part_i);
6291 
6292  // traverse all processors with decreasing load.
6293  for(mj_part_t iii = num_procs - 1; iii >= 0; --iii) {
6294  mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
6295  if(assigned_proc == -1 ||
6296  (space_in_each_processor[ii] > space_in_each_processor[assigned_proc])) {
6297  assigned_proc = ii;
6298  }
6299  else if(space_in_each_processor[ii] == space_in_each_processor[assigned_proc]) {
6300  if(ii < assigned_proc) {
6301  // ties go to lower proc
6302  // not necessary for a valid result but allows testing to compare
6303  // MPI results and have parts numbers assigned to the same boxes.
6304  // We don't break here because we may have more ties still to check.
6305  // The indeterminate state before this is due to Cuda using
6306  // atomics to refill the permutation array. So non-cuda runs don't
6307  // actualy need this since they will always have the same pattern.
6308  assigned_proc = ii;
6309  }
6310  }
6311  else {
6312  break; // now we can break - we have our part and no more ties.
6313  }
6314  }
6315 
6316  if(num_parts_proc_assigned[assigned_proc]++ == 0) {
6317  --empty_proc_count;
6318  }
6319 
6320  space_in_each_processor[assigned_proc] -= load;
6321  //to sort later, part-i is assigned to the proccessor - assignment.
6322  sort_item_part_to_proc_assignment[j].id = i; //part i
6323 
6324  // assigned to processor - assignment.
6325  sort_item_part_to_proc_assignment[j].val = assigned_proc;
6326 
6327  // if assigned processor is me, increase the number.
6328  if(assigned_proc == this->myRank) {
6329  out_num_part++;//assigned_part_count;
6330  out_part_indices.push_back(i);
6331  }
6332 
6333  // increase the send to that processor by the number of points in that
6334  // part, as everyone send their coordiantes in this part to the
6335  // processor assigned to this part.
6336  send_count_to_each_proc[assigned_proc] +=
6337  num_points_in_all_processor_parts[this->myRank * num_parts + i];
6338  }
6339 
6340  delete [] num_parts_proc_assigned;
6341  delete [] sort_item_num_points_of_proc_in_part_i;
6342  delete [] sort_item_point_counts_in_parts;
6343  delete [] space_in_each_processor;
6344 
6345  // sort assignments with respect to the assigned processors.
6346  uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);
6347 
6348  // fill sendBuf.
6349  this->assign_send_destinations2(
6350  num_parts,
6351  sort_item_part_to_proc_assignment,
6352  coordinate_destinations,
6353  output_part_numbering_begin_index,
6354  next_future_num_parts_in_parts);
6355 
6356  delete [] sort_item_part_to_proc_assignment;
6357 }
6358 
6359 
6383 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6384  typename mj_part_t, typename mj_node_t>
6385 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6386  mj_migration_part_proc_assignment(
6387  mj_gno_t * num_points_in_all_processor_parts,
6388  mj_part_t num_parts,
6389  mj_part_t num_procs,
6390  mj_lno_t *send_count_to_each_proc,
6391  std::vector<mj_part_t> &processor_ranks_for_subcomm,
6392  std::vector<mj_part_t> *next_future_num_parts_in_parts,
6393  mj_part_t &out_num_part,
6394  std::vector<mj_part_t> &out_part_indices,
6395  mj_part_t &output_part_numbering_begin_index,
6396  int *coordinate_destinations)
6397 {
6398  processor_ranks_for_subcomm.clear();
6399  // if(this->num_local_coords > 0)
6400  if(num_procs > num_parts) {
6401  // if there are more processors than the number of current part
6402  // then processors share the existing parts.
6403  // at the end each processor will have a single part,
6404  // but a part will be shared by a group of processors.
6405  mj_part_t out_part_index = 0;
6406 
6407  this->mj_assign_proc_to_parts(
6408  num_points_in_all_processor_parts,
6409  num_parts,
6410  num_procs,
6411  send_count_to_each_proc,
6412  processor_ranks_for_subcomm,
6413  next_future_num_parts_in_parts,
6414  out_part_index,
6415  output_part_numbering_begin_index,
6416  coordinate_destinations
6417  );
6418 
6419  out_num_part = 1;
6420  out_part_indices.clear();
6421  out_part_indices.push_back(out_part_index);
6422  }
6423  else {
6424 
6425  // there are more parts than the processors.
6426  // therefore a processor will be assigned multiple parts,
6427  // the subcommunicators will only have a single processor.
6428  processor_ranks_for_subcomm.push_back(this->myRank);
6429 
6430  // since there are more parts then procs,
6431  // assign multiple parts to processors.
6432 
6433  this->mj_assign_parts_to_procs(
6434  num_points_in_all_processor_parts,
6435  num_parts,
6436  num_procs,
6437  send_count_to_each_proc,
6438  next_future_num_parts_in_parts,
6439  out_num_part,
6440  out_part_indices,
6441  output_part_numbering_begin_index,
6442  coordinate_destinations);
6443  }
6444 }
6445 
6459 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6460  typename mj_part_t, typename mj_node_t>
6461 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6462  mj_migrate_coords(
6463  mj_part_t num_procs,
6464  mj_lno_t &num_new_local_points,
6465  std::string iteration,
6466  int *coordinate_destinations,
6467  mj_part_t num_parts)
6468 {
6469 
6470 #ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
6471  if(sizeof(mj_lno_t) <= sizeof(int)) {
6472  // Cannot use Zoltan_Comm with local ordinals larger than ints.
6473  // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
6474  // may overflow.
6475  ZOLTAN_COMM_OBJ *plan = NULL;
6476  MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->comm));
6477  int num_incoming_gnos = 0;
6478  int message_tag = 7859;
6479 
6480  this->mj_env->timerStart(MACRO_TIMERS,
6481  mj_timer_base_string + "Migration Z1PlanCreating-" + iteration);
6482  int ierr = Zoltan_Comm_Create(
6483  &plan,
6484  int(this->num_local_coords),
6485  coordinate_destinations,
6486  mpi_comm,
6487  message_tag,
6488  &num_incoming_gnos);
6489 
6490  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6491  this->mj_env->timerStop(MACRO_TIMERS,
6492  mj_timer_base_string + "Migration Z1PlanCreating-" + iteration);
6493 
6494  this->mj_env->timerStart(MACRO_TIMERS,
6495  mj_timer_base_string + "Migration Z1Migration-" + iteration);
6496 
6497  // MPI Buffers should be on Kokkos::HostSpace not Kokkos::CudaUVMSpace
6498  // Note, with UVM space, create_mirror_view does NOT create a non-UVM
6499  // view; need the explicit Host creation and deep_copy.
6500 
6501  // migrate gnos.
6502  {
6503  auto host_current_mj_gnos = Kokkos::create_mirror_view(
6504  Kokkos::HostSpace(), this->current_mj_gnos);
6505  Kokkos::deep_copy(host_current_mj_gnos, this->current_mj_gnos);
6506  Kokkos::View<mj_gno_t*, device_t> dst_gnos(
6507  Kokkos::ViewAllocateWithoutInitializing("dst_gnos"), num_incoming_gnos);
6508  auto host_dst_gnos = Kokkos::create_mirror_view(
6509  Kokkos::HostSpace(), dst_gnos);
6510  message_tag++;
6511  ierr = Zoltan_Comm_Do(
6512  plan,
6513  message_tag,
6514  (char *) host_current_mj_gnos.data(),
6515  sizeof(mj_gno_t),
6516  (char *) host_dst_gnos.data());
6517  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6518  Kokkos::deep_copy(dst_gnos, host_dst_gnos);
6519  this->current_mj_gnos = dst_gnos;
6520  }
6521 
6522  //migrate coordinates
6523  {
6524  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
6525  auto host_src_coordinates = Kokkos::create_mirror_view(
6526  Kokkos::HostSpace(), this->mj_coordinates);
6527  Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6528  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6529  dst_coordinates(Kokkos::ViewAllocateWithoutInitializing("mj_coordinates"),
6530  num_incoming_gnos, this->coord_dim);
6531  auto host_dst_coordinates = Kokkos::create_mirror_view(
6532  Kokkos::HostSpace(), dst_coordinates);
6533  for(int i = 0; i < this->coord_dim; ++i) {
6534  Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sub_host_src_coordinates
6535  = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6536  Kokkos::View<mj_scalar_t *, Kokkos::HostSpace> sub_host_dst_coordinates
6537  = Kokkos::subview(host_dst_coordinates, Kokkos::ALL, i);
6538  // Note Layout Left means we can do these in contiguous blocks
6539  message_tag++;
6540  ierr = Zoltan_Comm_Do(
6541  plan,
6542  message_tag,
6543  (char *) sub_host_src_coordinates.data(),
6544  sizeof(mj_scalar_t),
6545  (char *) sub_host_dst_coordinates.data());
6546  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6547  }
6548  deep_copy(dst_coordinates, host_dst_coordinates);
6549  this->mj_coordinates = dst_coordinates;
6550  }
6551 
6552  // migrate weights.
6553  {
6554  auto host_src_weights = Kokkos::create_mirror_view(
6555  Kokkos::HostSpace(), this->mj_weights);
6556  Kokkos::deep_copy(host_src_weights, this->mj_weights);
6557  Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6558  Kokkos::ViewAllocateWithoutInitializing("mj_weights"),
6559  num_incoming_gnos, this->num_weights_per_coord);
6560  auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
6561  for(int i = 0; i < this->num_weights_per_coord; ++i) {
6562  auto sub_host_src_weights
6563  = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6564  auto sub_host_dst_weights
6565  = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6566  ArrayRCP<mj_scalar_t> sent_weight(this->num_local_coords);
6567  // Copy because of layout
6568  for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6569  sent_weight[n] = sub_host_src_weights(n);
6570  }
6571  ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
6572  message_tag++;
6573  ierr = Zoltan_Comm_Do(
6574  plan,
6575  message_tag,
6576  (char *) sent_weight.getRawPtr(),
6577  sizeof(mj_scalar_t),
6578  (char *) received_weight.getRawPtr());
6579  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6580  // Again we copy by index due to layout
6581  for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6582  sub_host_dst_weights(n) = received_weight[n];
6583  }
6584  }
6585  deep_copy(dst_weights, host_dst_weights);
6586  this->mj_weights = dst_weights;
6587  }
6588 
6589  // migrate owners.
6590  {
6591  // Note that owners we kept on Serial
6592  Kokkos::View<int *, Kokkos::HostSpace> dst_owners_of_coordinate(
6593  Kokkos::ViewAllocateWithoutInitializing("owner_of_coordinate"),
6594  num_incoming_gnos);
6595  message_tag++;
6596  ierr = Zoltan_Comm_Do(
6597  plan,
6598  message_tag,
6599  (char *) owner_of_coordinate.data(),
6600  sizeof(int),
6601  (char *) dst_owners_of_coordinate.data());
6602  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6603  this->owner_of_coordinate = dst_owners_of_coordinate;
6604  }
6605 
6606  // if num procs is less than num parts,
6607  // we need the part assigment arrays as well, since
6608  // there will be multiple parts in processor.
6609  {
6610  auto host_src_assigned_part_ids = Kokkos::create_mirror_view(
6611  Kokkos::HostSpace(), this->assigned_part_ids);
6612  Kokkos::deep_copy(host_src_assigned_part_ids, this->assigned_part_ids);
6613  Kokkos::View<int *, device_t> dst_assigned_part_ids(
6614  Kokkos::ViewAllocateWithoutInitializing("assigned_part_ids"),
6615  num_incoming_gnos);
6616  auto host_dst_assigned_part_ids = Kokkos::create_mirror_view(
6617  Kokkos::HostSpace(), dst_assigned_part_ids);
6618  mj_part_t *new_parts = new mj_part_t[num_incoming_gnos];
6619  if(num_procs < num_parts) {
6620  message_tag++;
6621  ierr = Zoltan_Comm_Do(
6622  plan,
6623  message_tag,
6624  (char *) host_src_assigned_part_ids.data(),
6625  sizeof(mj_part_t),
6626  (char *) host_dst_assigned_part_ids.data());
6627  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6628  Kokkos::deep_copy(dst_assigned_part_ids, host_dst_assigned_part_ids);
6629  }
6630  // In original code this would just assign to an uninitialized array
6631  // if num_procs < num_parts. We're doing the same here.
6632  this->assigned_part_ids = dst_assigned_part_ids;
6633  }
6634 
6635  ierr = Zoltan_Comm_Destroy(&plan);
6636  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6637  num_new_local_points = num_incoming_gnos;
6638  this->mj_env->timerStop(MACRO_TIMERS,
6639  mj_timer_base_string + "Migration Z1Migration-" + iteration);
6640  }
6641  else
6642 #endif // ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
6643  {
6644  this->mj_env->timerStart(MACRO_TIMERS, mj_timer_base_string +
6645  "Migration DistributorPlanCreating-" + iteration);
6646 
6647  Tpetra::Distributor distributor(this->comm);
6648  ArrayView<const mj_part_t> destinations( coordinate_destinations,
6649  this->num_local_coords);
6650  mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
6651  this->mj_env->timerStop(MACRO_TIMERS, mj_timer_base_string +
6652  "Migration DistributorPlanCreating-" + iteration);
6653  this->mj_env->timerStart(MACRO_TIMERS, mj_timer_base_string +
6654  "Migration DistributorMigration-" + iteration);
6655 
6656  // note MPI buffers should all be on Kokkos::HostSpace and not
6657  // Kokkos::CudaUVMSpace.
6658  // Note, with UVM space, create_mirror_view does NOT create a non-UVM
6659  // view; need the explicit Host creation and deep_copy.
6660  // migrate gnos.
6661  {
6662  Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
6663  Kokkos::ViewAllocateWithoutInitializing("received_gnos"),
6664  num_incoming_gnos);
6665 
6666  Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
6667  Kokkos::ViewAllocateWithoutInitializing("sent_gnos"),
6668  this->current_mj_gnos.extent(0));
6669  Kokkos::deep_copy(sent_gnos, this->current_mj_gnos);
6670 
6671  distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
6672 
6673  this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
6674  Kokkos::ViewAllocateWithoutInitializing("gids"), num_incoming_gnos);
6675 
6676  Kokkos::deep_copy(this->current_mj_gnos, received_gnos);
6677  }
6678 
6679  // migrate coordinates
6680  // coordinates in MJ are LayoutLeft since Tpetra Multivector is LayoutLeft
6681  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6682  dst_coordinates("mj_coordinates", num_incoming_gnos, this->coord_dim);
6683 
6684  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>
6685  host_src_coordinates(
6686  Kokkos::ViewAllocateWithoutInitializing("host_coords"),
6687  this->mj_coordinates.extent(0), this->mj_coordinates.extent(1));
6688  Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6689 
6690  Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_coord(
6691  Kokkos::ViewAllocateWithoutInitializing("received_coord"),
6692  num_incoming_gnos);
6693 
6694  for(int i = 0; i < this->coord_dim; ++i) {
6695 
6696  // Note Layout Left means we can do these in contiguous blocks
6697 
6698  Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_coord
6699  = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6700 
6701  distributor.doPostsAndWaits(sent_coord, 1, received_coord);
6702 
6703  Kokkos::deep_copy(Kokkos::subview(dst_coordinates, Kokkos::ALL, i),
6704  received_coord);
6705 
6706  // Kokkos::deep_copy will fence, I think, so it should be safe
6707  // to reuse received_coord in the next lop iteration
6708  }
6709  this->mj_coordinates = dst_coordinates;
6710 
6711  // migrate weights.
6712  Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6713  "mj_weights", num_incoming_gnos, this->num_weights_per_coord);
6714  auto host_dst_weights = Kokkos::create_mirror_view(Kokkos::HostSpace(),
6715  dst_weights);
6716 
6717  auto host_src_weights = Kokkos::create_mirror_view_and_copy(
6718  Kokkos::HostSpace(), this->mj_weights);
6719 
6720  // contiguous buffers to gather potentially strided data
6721  Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_weight(
6722  Kokkos::ViewAllocateWithoutInitializing("send_weight_buffer"),
6723  this->num_local_coords);
6724 
6725  Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_weight(
6726  Kokkos::ViewAllocateWithoutInitializing("received_weight_buffer"),
6727  num_incoming_gnos);
6728 
6729  for(int i = 0; i < this->num_weights_per_coord; ++i) {
6730 
6731  auto sub_host_src_weights
6732  = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6733 
6734  auto sub_host_dst_weights
6735  = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6736 
6737 
6738  // Layout Right means the weights are not contiguous
6739  // However we don't have any systems setup with more than 1 weight so
6740  // really I have not tested any of this code with num weights > 1.
6741  // I think this is the right thing to do.
6742  for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6743  sent_weight[n] = sub_host_src_weights(n);
6744  }
6745 
6746  distributor.doPostsAndWaits(sent_weight, 1, received_weight);
6747 
6748  // Again we copy by index due to layout
6749  for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6750  sub_host_dst_weights(n) = received_weight[n];
6751  }
6752  }
6753  Kokkos::deep_copy(dst_weights, host_dst_weights);
6754  this->mj_weights = dst_weights;
6755 
6756  // migrate owners
6757  {
6758  // Note owners we kept on Serial
6759  Kokkos::View<int *, Kokkos::HostSpace> received_owners(
6760  Kokkos::ViewAllocateWithoutInitializing("owner_of_coordinate"),
6761  num_incoming_gnos);
6762 
6763  distributor.doPostsAndWaits(owner_of_coordinate, 1, received_owners);
6764 
6765  this->owner_of_coordinate = received_owners;
6766  }
6767 
6768  // if num procs is less than num parts,
6769  // we need the part assigment arrays as well, since
6770  // there will be multiple parts in processor.
6771  if(num_procs < num_parts) {
6772  Kokkos::View<mj_part_t*, Kokkos::HostSpace> sent_partids(
6773  Kokkos::ViewAllocateWithoutInitializing("host_parts"),
6774  this->assigned_part_ids.extent(0));
6775  Kokkos::deep_copy(sent_partids, assigned_part_ids);
6776 
6777  Kokkos::View<mj_part_t*, Kokkos::HostSpace> received_partids(
6778  Kokkos::ViewAllocateWithoutInitializing("received_partids"),
6779  num_incoming_gnos);
6780 
6781  distributor.doPostsAndWaits(sent_partids, 1, received_partids);
6782 
6783  this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6784  ("assigned_part_ids", num_incoming_gnos);
6785  Kokkos::deep_copy(this->assigned_part_ids, received_partids);
6786  }
6787  else {
6788  this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6789  ("assigned_part_ids", num_incoming_gnos);
6790  }
6791  this->mj_env->timerStop(MACRO_TIMERS, "" + mj_timer_base_string +
6792  "Migration DistributorMigration-" + iteration);
6793 
6794  num_new_local_points = num_incoming_gnos;
6795  }
6796 }
6797 
6803 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6804  typename mj_part_t, typename mj_node_t>
6805 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6806  create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm)
6807 {
6808  mj_part_t group_size = processor_ranks_for_subcomm.size();
6809  mj_part_t *ids = new mj_part_t[group_size];
6810  for(mj_part_t i = 0; i < group_size; ++i) {
6811  ids[i] = processor_ranks_for_subcomm[i];
6812  }
6813  ArrayView<const mj_part_t> idView(ids, group_size);
6814  this->comm = this->comm->createSubcommunicator(idView);
6815  delete [] ids;
6816 }
6817 
6823 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6824  typename mj_part_t, typename mj_node_t>
6825 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6826  fill_permutation_array(
6827  mj_part_t output_num_parts,
6828  mj_part_t num_parts)
6829 {
6830  // if there is single output part, then simply fill the permutation array.
6831  if(output_num_parts == 1) {
6832  auto local_new_coordinate_permutations = this->new_coordinate_permutations;
6833  Kokkos::parallel_for(
6834  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
6835  (0, this->num_local_coords),
6836  KOKKOS_LAMBDA(mj_lno_t i) {
6837  local_new_coordinate_permutations(i) = i;
6838  });
6839  auto local_new_part_xadj = this->new_part_xadj;
6840  auto local_num_local_coords = this->num_local_coords;
6841  Kokkos::parallel_for(
6842  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
6843  KOKKOS_LAMBDA(int dummy) {
6844  local_new_part_xadj(0) = local_num_local_coords;
6845  });
6846  }
6847  else {
6848  auto local_num_local_coords = this->num_local_coords;
6849  auto local_assigned_part_ids = this->assigned_part_ids;
6850  auto local_new_part_xadj = this->new_part_xadj;
6851  auto local_new_coordinate_permutations = this->new_coordinate_permutations;
6852 
6853  // part shift holds the which part number an old part number corresponds to.
6854  Kokkos::View<mj_part_t*, device_t> part_shifts("part_shifts", num_parts);
6855 
6856  // otherwise we need to count how many points are there in each part.
6857  // we allocate here as num_parts, because the sent partids are up to
6858  // num_parts, although there are outout_num_parts different part.
6859  Kokkos::View<mj_lno_t*, device_t> num_points_in_parts(
6860  "num_points_in_parts", num_parts);
6861 
6862  Kokkos::parallel_for(
6863  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
6864  KOKKOS_LAMBDA(int dummy) {
6865 
6866  for(mj_lno_t i = 0; i < local_num_local_coords; ++i) {
6867  mj_part_t ii = local_assigned_part_ids(i);
6868  ++num_points_in_parts(ii);
6869  }
6870 
6871  // write the end points of the parts.
6872  mj_part_t p = 0;
6873  mj_lno_t prev_index = 0;
6874  for(mj_part_t i = 0; i < num_parts; ++i) {
6875  if(num_points_in_parts(i) > 0) {
6876  local_new_part_xadj(p) = prev_index + num_points_in_parts(i);
6877  prev_index += num_points_in_parts(i);
6878  part_shifts(i) = p++;
6879  }
6880  }
6881 
6882  // for the rest of the parts write the end index as end point.
6883  mj_part_t assigned_num_parts = p - 1;
6884  for(;p < num_parts; ++p) {
6885  local_new_part_xadj(p) =
6886  local_new_part_xadj(assigned_num_parts);
6887  }
6888  for(mj_part_t i = 0; i < output_num_parts; ++i) {
6889  num_points_in_parts(i) = local_new_part_xadj(i);
6890  }
6891 
6892  // write the permutation array here.
6893  // get the part of the coordinate i, shift it to obtain the new part number.
6894  // assign it to the end of the new part numbers pointer.
6895  for(mj_lno_t i = local_num_local_coords - 1; i >= 0; --i) {
6896  mj_part_t part =
6897  part_shifts[mj_part_t(local_assigned_part_ids(i))];
6898  local_new_coordinate_permutations(--num_points_in_parts[part]) = i;
6899  }
6900  });
6901  }
6902 }
6903 
6928 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6929  typename mj_part_t, typename mj_node_t>
6930 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6931  mj_perform_migration(
6932  mj_part_t input_num_parts,
6933  mj_part_t &output_num_parts,
6934  std::vector<mj_part_t> *next_future_num_parts_in_parts,
6935  mj_part_t &output_part_begin_index,
6936  size_t migration_reduce_all_population,
6937  mj_lno_t num_coords_for_last_dim_part,
6938  std::string iteration,
6939  RCP<mj_partBoxVector_t> &input_part_boxes,
6940  RCP<mj_partBoxVector_t> &output_part_boxes)
6941 {
6942  mj_part_t num_procs = this->comm->getSize();
6943  this->myRank = this->comm->getRank();
6944 
6945  // this array holds how many points each processor has in each part.
6946  // to access how many points processor i has on part j,
6947  // num_points_in_all_processor_parts[i * num_parts + j]
6948  mj_gno_t *num_points_in_all_processor_parts =
6949  new mj_gno_t[input_num_parts * (num_procs + 1)];
6950 
6951  // get the number of coordinates in each part in each processor.
6952  this->get_processor_num_points_in_parts(
6953  num_procs,
6954  input_num_parts,
6955  num_points_in_all_processor_parts);
6956 
6957  // check if migration will be performed or not.
6958  if(!this->mj_check_to_migrate(
6959  migration_reduce_all_population,
6960  num_coords_for_last_dim_part,
6961  num_procs,
6962  input_num_parts,
6963  num_points_in_all_processor_parts)) {
6964  delete [] num_points_in_all_processor_parts;
6965  return false;
6966  }
6967 
6968  mj_lno_t *send_count_to_each_proc = NULL;
6969  int *coordinate_destinations = new int[this->num_local_coords];
6970  send_count_to_each_proc = new mj_lno_t[num_procs];
6971 
6972  for(int i = 0; i < num_procs; ++i) {
6973  send_count_to_each_proc[i] = 0;
6974  }
6975 
6976  std::vector<mj_part_t> processor_ranks_for_subcomm;
6977  std::vector<mj_part_t> out_part_indices;
6978 
6979  // determine which processors are assigned to which parts
6980  this->mj_migration_part_proc_assignment(
6981  num_points_in_all_processor_parts,
6982  input_num_parts,
6983  num_procs,
6984  send_count_to_each_proc,
6985  processor_ranks_for_subcomm,
6986  next_future_num_parts_in_parts,
6987  output_num_parts,
6988  out_part_indices,
6989  output_part_begin_index,
6990  coordinate_destinations);
6991 
6992  delete [] send_count_to_each_proc;
6993  std::vector <mj_part_t> tmpv;
6994 
6995  std::sort (out_part_indices.begin(), out_part_indices.end());
6996  mj_part_t outP = out_part_indices.size();
6997  mj_gno_t new_global_num_points = 0;
6998  mj_gno_t *global_num_points_in_parts =
6999  num_points_in_all_processor_parts + num_procs * input_num_parts;
7000 
7001  if(this->mj_keep_part_boxes) {
7002  input_part_boxes->clear();
7003  }
7004 
7005  // now we calculate the new values for next_future_num_parts_in_parts.
7006  // same for the part boxes.
7007  for(mj_part_t i = 0; i < outP; ++i) {
7008  mj_part_t ind = out_part_indices[i];
7009  new_global_num_points += global_num_points_in_parts[ind];
7010  tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
7011  if(this->mj_keep_part_boxes) {
7012  input_part_boxes->push_back((*output_part_boxes)[ind]);
7013  }
7014  }
7015 
7016  // swap the input and output part boxes.
7017  if(this->mj_keep_part_boxes) {
7018  RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7019  input_part_boxes = output_part_boxes;
7020  output_part_boxes = tmpPartBoxes;
7021  }
7022  next_future_num_parts_in_parts->clear();
7023  for(mj_part_t i = 0; i < outP; ++i) {
7024  mj_part_t p = tmpv[i];
7025  next_future_num_parts_in_parts->push_back(p);
7026  }
7027 
7028  delete [] num_points_in_all_processor_parts;
7029 
7030  mj_lno_t num_new_local_points = 0;
7031  //perform the actual migration operation here.
7032  this->mj_migrate_coords(
7033  num_procs,
7034  num_new_local_points,
7035  iteration,
7036  coordinate_destinations,
7037  input_num_parts);
7038 
7039  delete [] coordinate_destinations;
7040  if(this->num_local_coords != num_new_local_points) {
7041  this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
7042  (Kokkos::ViewAllocateWithoutInitializing("new_coordinate_permutations"),
7043  num_new_local_points);
7044  this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
7045  (Kokkos::ViewAllocateWithoutInitializing("coordinate_permutations"),
7046  num_new_local_points);
7047  }
7048  this->num_local_coords = num_new_local_points;
7049  this->num_global_coords = new_global_num_points;
7050 
7051  // create subcommunicator.
7052  this->create_sub_communicator(processor_ranks_for_subcomm);
7053 
7054  processor_ranks_for_subcomm.clear();
7055 
7056  // fill the new permutation arrays.
7057  this->fill_permutation_array(output_num_parts, input_num_parts);
7058 
7059  return true;
7060 }
7061 
7080 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7081  typename mj_part_t, typename mj_node_t>
7082 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7083  create_consistent_chunks(
7084  mj_part_t num_parts,
7085  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
7086  Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
7087  mj_lno_t coordinate_begin,
7088  mj_lno_t coordinate_end,
7089  Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
7090  Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
7091  int coordInd,
7092  bool longest_dim_part,
7093  uSignedSortItem<int, mj_scalar_t, char> * p_coord_dimension_range_sorted)
7094 {
7095  // Note that this method is only used by task mapper
7096  // All code in this file has been verified to run with UVM off by running
7097  // mj tests and task mapper tests with UVM off. However for this particular
7098  // method I did not do much for UVM off. I heavily use device to host copies
7099  // and more or less preserve the original logic. Due to the handling of
7100  // arrays it will be a bit of work to convert this to a better form.
7101  // Since it's only relevant to task mapper and I wasn't sure how much priority
7102  // to give it, I put that on hold until further discussion.
7103  mj_part_t no_cuts = num_parts - 1;
7104 
7105  // now if the rectilinear partitioning is allowed we decide how
7106  // much weight each thread should put to left and right.
7107  if(this->distribute_points_on_cut_lines) {
7108  auto local_thread_cut_line_weight_to_put_left =
7109  this->thread_cut_line_weight_to_put_left;
7110  auto local_thread_part_weight_work =
7111  this->thread_part_weight_work;
7112  auto local_sEpsilon = this->sEpsilon;
7113 
7114  Kokkos::parallel_for(
7115  Kokkos::RangePolicy<typename mj_node_t::execution_space,
7116  mj_part_t> (0, no_cuts), KOKKOS_LAMBDA (mj_part_t i) {
7117  // the left to be put on the left of the cut.
7118  mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
7119  if(left_weight > local_sEpsilon) {
7120  // the weight of thread ii on cut.
7121  mj_scalar_t thread_ii_weight_on_cut =
7122  local_thread_part_weight_work(i * 2 + 1) -
7123  local_thread_part_weight_work(i * 2);
7124  if(thread_ii_weight_on_cut < left_weight) {
7125  local_thread_cut_line_weight_to_put_left(i) =
7126  thread_ii_weight_on_cut;
7127  }
7128  else {
7129  local_thread_cut_line_weight_to_put_left(i) = left_weight;
7130  }
7131  }
7132  else {
7133  local_thread_cut_line_weight_to_put_left(i) = 0;
7134  }
7135  });
7136 
7137  if(no_cuts > 0) {
7138  auto local_least_signifiance = least_signifiance;
7139  auto local_significance_mul = significance_mul;
7140  Kokkos::parallel_for(
7141  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
7142  (0, 1), KOKKOS_LAMBDA (int dummy) {
7143  // this is a special case. If cutlines share the same coordinate,
7144  // their weights are equal.
7145  // we need to adjust the ratio for that.
7146  for(mj_part_t i = no_cuts - 1; i > 0 ; --i) {
7147  mj_scalar_t cut1 = current_concurrent_cut_coordinate(i-1);
7148  mj_scalar_t cut2 = current_concurrent_cut_coordinate(i);
7149  mj_scalar_t delta = cut2 - cut1;
7150  mj_scalar_t abs_delta = (delta > 0) ? delta : -delta;
7151  if(abs_delta < local_sEpsilon) {
7152  local_thread_cut_line_weight_to_put_left(i) -=
7153  local_thread_cut_line_weight_to_put_left(i - 1);
7154  }
7155  local_thread_cut_line_weight_to_put_left(i) =
7156  static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
7157  local_least_signifiance) * local_significance_mul) /
7158  static_cast<mj_scalar_t>(local_significance_mul);
7159  }
7160  });
7161  }
7162  }
7163 
7164  auto local_thread_point_counts = this->thread_point_counts;
7165  Kokkos::parallel_for(
7166  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
7167  (0, num_parts), KOKKOS_LAMBDA (mj_part_t i) {
7168  local_thread_point_counts(i) = 0;
7169  });
7170 
7171  // for this specific case we dont want to distribute the points along the
7172  // cut position randomly, as we need a specific ordering of them. Instead,
7173  // we put the coordinates into a sort item, where we sort those
7174  // using the coordinates of points on other dimensions and the index.
7175 
7176  // some of the cuts might share the same position.
7177  // in this case, if cut i and cut j share the same position
7178  // cut_map[i] = cut_map[j] = sort item index.
7179  mj_part_t *cut_map = new mj_part_t[no_cuts];
7180 
7181  typedef uMultiSortItem<mj_lno_t, int, mj_scalar_t> multiSItem;
7182  typedef std::vector< multiSItem > multiSVector;
7183  typedef std::vector<multiSVector> multiS2Vector;
7184 
7185  // to keep track of the memory allocated.
7186  std::vector<mj_scalar_t *>allocated_memory;
7187 
7188  // vector for which the coordinates will be sorted.
7189  multiS2Vector sort_vector_points_on_cut;
7190 
7191  // the number of cuts that have different coordinates.
7192  mj_part_t different_cut_count = 1;
7193  cut_map[0] = 0;
7194 
7195  // now we insert 1 sort vector for all cuts on the different
7196  // positions. If multiple cuts are on the same position,
7197  // they share sort vectors.
7198  multiSVector tmpMultiSVector;
7199  sort_vector_points_on_cut.push_back(tmpMultiSVector);
7200 
7201  auto local_current_concurrent_cut_coordinate =
7202  current_concurrent_cut_coordinate;
7203  auto host_current_concurrent_cut_coordinate =
7204  Kokkos::create_mirror_view(local_current_concurrent_cut_coordinate);
7205  Kokkos::deep_copy(host_current_concurrent_cut_coordinate,
7206  local_current_concurrent_cut_coordinate);
7207 
7208  for(mj_part_t i = 1; i < no_cuts ; ++i) {
7209  // if cuts share the same cut coordinates
7210  // set the cutmap accordingly.
7211  if(std::abs(host_current_concurrent_cut_coordinate(i) -
7212  host_current_concurrent_cut_coordinate(i-1)) < this->sEpsilon) {
7213  cut_map[i] = cut_map[i-1];
7214  }
7215  else {
7216  cut_map[i] = different_cut_count++;
7217  multiSVector tmp2MultiSVector;
7218  sort_vector_points_on_cut.push_back(tmp2MultiSVector);
7219  }
7220  }
7221  Kokkos::deep_copy(current_concurrent_cut_coordinate,
7222  host_current_concurrent_cut_coordinate);
7223 
7224  // now the actual part assigment.
7225  auto host_coordinate_permutations =
7226  Kokkos::create_mirror_view(coordinate_permutations);
7227  Kokkos::deep_copy(host_coordinate_permutations, coordinate_permutations);
7228 
7229  auto host_assigned_part_ids = Kokkos::create_mirror_view(assigned_part_ids);
7230  Kokkos::deep_copy(host_assigned_part_ids, assigned_part_ids);
7231 
7232  auto host_mj_coordinates = Kokkos::create_mirror_view(mj_coordinates);
7233  Kokkos::deep_copy(host_mj_coordinates, mj_coordinates);
7234 
7235  auto host_thread_point_counts = Kokkos::create_mirror_view(thread_point_counts);
7236  Kokkos::deep_copy(host_thread_point_counts, thread_point_counts);
7237 
7238  auto local_coord_dim = this->coord_dim;
7239 
7240  for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7241  mj_lno_t i = host_coordinate_permutations(ii);
7242  mj_part_t pp = host_assigned_part_ids(i);
7243  mj_part_t p = pp / 2;
7244  // if the coordinate is on a cut.
7245  if(pp % 2 == 1 ) {
7246  mj_scalar_t *vals = new mj_scalar_t[local_coord_dim -1];
7247  allocated_memory.push_back(vals);
7248 
7249  // we insert the coordinates to the sort item here.
7250  int val_ind = 0;
7251 
7252  if(longest_dim_part) {
7253  // std::cout << std::endl << std::endl;
7254  for(int dim = local_coord_dim - 2; dim >= 0; --dim) {
7255  // uSignedSortItem<int, mj_scalar_t, char>
7256  // *p_coord_dimension_range_sorted
7257  int next_largest_coord_dim = p_coord_dimension_range_sorted[dim].id;
7258  // std::cout << "next_largest_coord_dim: " <<
7259  // next_largest_coord_dim << " ";
7260  // Note refactor in progress
7261  vals[val_ind++] =
7262  host_mj_coordinates(i,next_largest_coord_dim);
7263  }
7264  }
7265  else {
7266  for(int dim = coordInd + 1; dim < local_coord_dim; ++dim) {
7267  vals[val_ind++] = host_mj_coordinates(i,dim);
7268  }
7269  for(int dim = 0; dim < coordInd; ++dim) {
7270  vals[val_ind++] = host_mj_coordinates(i,dim);
7271  }
7272  }
7273 
7274  multiSItem tempSortItem(i, local_coord_dim -1, vals);
7275  //insert the point to the sort vector pointed by the cut_map[p].
7276  mj_part_t cmap = cut_map[p];
7277  sort_vector_points_on_cut[cmap].push_back(tempSortItem);
7278  }
7279  else {
7280  //if it is not on the cut, simple sorting.
7281  ++host_thread_point_counts(p);
7282  host_assigned_part_ids(i) = p;
7283  }
7284  }
7285 
7286  // sort all the sort vectors.
7287  for(mj_part_t i = 0; i < different_cut_count; ++i) {
7288  std::sort (sort_vector_points_on_cut[i].begin(),
7289  sort_vector_points_on_cut[i].end());
7290  }
7291 
7292  mj_part_t previous_cut_map = cut_map[0];
7293 
7294  auto host_thread_cut_line_weight_to_put_left =
7295  Kokkos::create_mirror_view(thread_cut_line_weight_to_put_left);
7296  Kokkos::deep_copy(host_thread_cut_line_weight_to_put_left,
7297  thread_cut_line_weight_to_put_left);
7298 
7299  auto host_mj_weights = Kokkos::create_mirror_view(mj_weights);
7300  Kokkos::deep_copy(host_mj_weights, mj_weights);
7301 
7302  // this is how much previous part owns the weight of the current part.
7303  // when target part weight is 1.6, and the part on the left is given 2,
7304  // the left has an extra 0.4, while the right has missing 0.4 from the
7305  // previous cut.
7306  // This parameter is used to balance this issues.
7307  // in the above example weight_stolen_from_previous_part will be 0.4.
7308  // if the left part target is 2.2 but it is given 2,
7309  // then weight_stolen_from_previous_part will be -0.2.
7310  mj_scalar_t weight_stolen_from_previous_part = 0;
7311  for(mj_part_t p = 0; p < no_cuts; ++p) {
7312  mj_part_t mapped_cut = cut_map[p];
7313 
7314  // if previous cut map is done, and it does not have the same index,
7315  // then assign all points left on that cut to its right.
7316  if(previous_cut_map != mapped_cut) {
7317  mj_lno_t sort_vector_end = (mj_lno_t)
7318  sort_vector_points_on_cut[previous_cut_map].size() - 1;
7319  for(; sort_vector_end >= 0; --sort_vector_end) {
7320  multiSItem t =
7321  sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7322  mj_lno_t i = t.index;
7323  ++host_thread_point_counts(p);
7324  host_assigned_part_ids(i) = p;
7325  }
7326  sort_vector_points_on_cut[previous_cut_map].clear();
7327  }
7328 
7329  // TODO: MD: I dont remember why I have it reverse order here.
7330  mj_lno_t sort_vector_end = (mj_lno_t)
7331  sort_vector_points_on_cut[mapped_cut].size() - 1;
7332  // mj_lno_t sort_vector_begin= 0;
7333  // mj_lno_t sort_vector_size =
7334  // (mj_lno_t)sort_vector_points_on_cut[mapped_cut].size();
7335 
7336  // TODO commented for reverse order
7337  for(; sort_vector_end >= 0; --sort_vector_end) {
7338  // for(; sort_vector_begin < sort_vector_size; ++sort_vector_begin) {
7339  // TODO COMMENTED FOR REVERSE ORDER
7340  multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
7341  //multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_begin];
7342  mj_lno_t i = t.index;
7343  mj_scalar_t w = this->mj_uniform_weights(0) ? 1 :
7344  this->mj_weights(i,0);
7345  // part p has enough space for point i, then put it to point i.
7346  if(host_thread_cut_line_weight_to_put_left(p) +
7347  weight_stolen_from_previous_part> this->sEpsilon &&
7348  host_thread_cut_line_weight_to_put_left(p) +
7349  weight_stolen_from_previous_part -
7350  std::abs(host_thread_cut_line_weight_to_put_left(p) +
7351  weight_stolen_from_previous_part - w)> this->sEpsilon)
7352  {
7353  host_thread_cut_line_weight_to_put_left(p) -= w;
7354 
7355  sort_vector_points_on_cut[mapped_cut].pop_back();
7356 
7357  ++host_thread_point_counts(p);
7358  host_assigned_part_ids(i) = p;
7359  // if putting this weight to left overweights the left cut, then
7360  // increase the space for the next cut using
7361  // weight_stolen_from_previous_part.
7362  if(p < no_cuts - 1 &&
7363  host_thread_cut_line_weight_to_put_left(p) < this->sEpsilon) {
7364  if(mapped_cut == cut_map[p + 1] ) {
7365  // if the cut before the cut indexed at p was also at the same
7366  // position special case, as we handle the weight differently here.
7367  if(previous_cut_map != mapped_cut) {
7368  weight_stolen_from_previous_part =
7369  host_thread_cut_line_weight_to_put_left(p);
7370  }
7371  else {
7372  // if the cut before the cut indexed at p was also at the same
7373  // position we assign extra weights cumulatively in this case.
7374  weight_stolen_from_previous_part +=
7375  host_thread_cut_line_weight_to_put_left(p);
7376  }
7377  }
7378  else{
7379  weight_stolen_from_previous_part =
7380  -host_thread_cut_line_weight_to_put_left(p);
7381  }
7382  // end assignment for part p
7383  break;
7384  }
7385  } else {
7386  // if part p does not have enough space for this point
7387  // and if there is another cut sharing the same position,
7388  // again increase the space for the next
7389  if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]) {
7390  if(previous_cut_map != mapped_cut) {
7391  weight_stolen_from_previous_part =
7392  host_thread_cut_line_weight_to_put_left(p);
7393  }
7394  else {
7395  weight_stolen_from_previous_part +=
7396  host_thread_cut_line_weight_to_put_left(p);
7397  }
7398  }
7399  else{
7400  weight_stolen_from_previous_part =
7401  -host_thread_cut_line_weight_to_put_left(p);
7402  }
7403  // end assignment for part p
7404  break;
7405  }
7406  }
7407  previous_cut_map = mapped_cut;
7408  }
7409 
7410  // TODO commented for reverse order
7411  // put everything left on the last cut to the last part.
7412  mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[
7413  previous_cut_map].size() - 1;
7414 
7415  // mj_lno_t sort_vector_begin= 0;
7416  // mj_lno_t sort_vector_size = (mj_lno_t)
7417  // sort_vector_points_on_cut[previous_cut_map].size();
7418  // TODO commented for reverse order
7419  for(; sort_vector_end >= 0; --sort_vector_end) {
7420  // TODO commented for reverse order
7421  multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7422  // multiSItem t =
7423  // sort_vector_points_on_cut[previous_cut_map][sort_vector_begin];
7424  mj_lno_t i = t.index;
7425  ++host_thread_point_counts(no_cuts);
7426  host_assigned_part_ids(i) = no_cuts;
7427  }
7428 
7429  sort_vector_points_on_cut[previous_cut_map].clear();
7430  delete [] cut_map;
7431 
7432  //free the memory allocated for vertex sort items .
7433  mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
7434  for(mj_lno_t i = 0; i < vSize; ++i) {
7435  delete [] allocated_memory[i];
7436  }
7437 
7438  auto local_out_part_xadj = out_part_xadj;
7439  auto host_out_part_xadj = Kokkos::create_mirror_view(local_out_part_xadj);
7440  Kokkos::deep_copy(host_out_part_xadj, out_part_xadj);
7441 
7442  // creation of part_xadj as in usual case.
7443  for(mj_part_t j = 0; j < num_parts; ++j) {
7444  host_out_part_xadj(j) = host_thread_point_counts(j);
7445  host_thread_point_counts(j) = 0;
7446  }
7447 
7448  // perform prefix sum for num_points in parts.
7449  for(mj_part_t j = 1; j < num_parts; ++j) {
7450  host_out_part_xadj(j) += host_out_part_xadj(j - 1);
7451  }
7452 
7453  // shift the num points in threads thread to obtain the
7454  // beginning index of each thread's private space.
7455  for(mj_part_t j = 1; j < num_parts; ++j) {
7456  host_thread_point_counts(j) += host_out_part_xadj(j - 1);
7457  }
7458 
7459  auto host_new_coordinate_permutations =
7460  Kokkos::create_mirror_view(new_coordinate_permutations);
7461  Kokkos::deep_copy(host_new_coordinate_permutations,
7462  new_coordinate_permutations);
7463 
7464  // now thread gets the coordinate and writes the index of coordinate to
7465  // the permutation array using the part index we calculated.
7466  for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7467  mj_lno_t i = host_coordinate_permutations(ii);
7468  mj_part_t p = host_assigned_part_ids(i);
7469  host_new_coordinate_permutations(coordinate_begin +
7470  host_thread_point_counts(p)++) = i;
7471  }
7472 
7473  Kokkos::deep_copy(thread_point_counts, host_thread_point_counts);
7474  Kokkos::deep_copy(new_coordinate_permutations,
7475  host_new_coordinate_permutations);
7476  Kokkos::deep_copy(local_out_part_xadj, host_out_part_xadj);
7477 }
7478 
7488 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7489  typename mj_part_t, typename mj_node_t>
7490 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7491  set_final_parts(
7492  mj_part_t current_num_parts,
7493  mj_part_t output_part_begin_index,
7494  RCP<mj_partBoxVector_t> &output_part_boxes,
7495  bool is_data_ever_migrated)
7496 {
7497  this->mj_env->timerStart(MACRO_TIMERS,
7498  mj_timer_base_string + "Part_Assignment");
7499 
7500  auto local_part_xadj = part_xadj;
7501  auto local_mj_keep_part_boxes = mj_keep_part_boxes;
7502  auto local_coordinate_permutations = coordinate_permutations;
7503  auto local_assigned_part_ids = assigned_part_ids;
7504 
7505  if(local_mj_keep_part_boxes) {
7506  for(int i = 0; i < current_num_parts; ++i) {
7507  (*output_part_boxes)[i].setpId(i + output_part_begin_index);
7508  }
7509  }
7510 
7511  Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy(
7512  current_num_parts, Kokkos::AUTO());
7513  typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
7515  Kokkos::parallel_for(policy, KOKKOS_LAMBDA(member_type team_member) {
7516  int i = team_member.league_rank();
7517  Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, (i != 0) ?
7518  local_part_xadj(i-1) : 0, local_part_xadj(i)),
7519  [=] (mj_lno_t ii) {
7520  mj_lno_t k = local_coordinate_permutations(ii);
7521  local_assigned_part_ids(k) = i + output_part_begin_index;
7522  });
7523  });
7524 
7525  if(is_data_ever_migrated) {
7526 #ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
7527  if(sizeof(mj_lno_t) <= sizeof(int)) {
7528 
7529  // Cannot use Zoltan_Comm with local ordinals larger than ints.
7530  // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
7531  // may overflow.
7532 
7533  // if data is migrated, then send part numbers to the original owners.
7534  ZOLTAN_COMM_OBJ *plan = NULL;
7535  MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->mj_problemComm));
7536 
7537  int incoming = 0;
7538  int message_tag = 7856;
7539 
7540  this->mj_env->timerStart(MACRO_TIMERS,
7541  mj_timer_base_string + "Final Z1PlanCreating");
7542 
7543  // setup incoming count
7544  int ierr = Zoltan_Comm_Create( &plan, int(this->num_local_coords),
7545  this->owner_of_coordinate.data(), mpi_comm, message_tag, &incoming);
7546 
7547  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7548  this->mj_env->timerStop(MACRO_TIMERS,
7549  mj_timer_base_string + "Final Z1PlanCreating" );
7550 
7551  this->mj_env->timerStart(MACRO_TIMERS,
7552  mj_timer_base_string + "Final Z1PlanComm");
7553 
7554  // MPI Buffers should be on Kokkos::HostSpace not Kokkos::CudaUVMSpace
7555  // Note, with UVM space, create_mirror_view does NOT create a non-UVM
7556  // view; need the explicit Host creation and deep_copy.
7557 
7558  // migrate gnos to actual owners.
7559  auto host_current_mj_gnos = Kokkos::create_mirror_view(
7560  Kokkos::HostSpace(), this->current_mj_gnos);
7561  deep_copy(host_current_mj_gnos, this->current_mj_gnos);
7562  Kokkos::View<mj_gno_t*, device_t> dst_gnos(
7563  Kokkos::ViewAllocateWithoutInitializing("dst_gnos"), incoming);
7564  auto host_dst_gnos = Kokkos::create_mirror_view(
7565  Kokkos::HostSpace(), dst_gnos);
7566  message_tag++;
7567  ierr = Zoltan_Comm_Do( plan, message_tag,
7568  (char *) host_current_mj_gnos.data(),
7569  sizeof(mj_gno_t), (char *) host_dst_gnos.data());
7570  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7571  Kokkos::deep_copy(dst_gnos, host_dst_gnos);
7572  this->current_mj_gnos = dst_gnos;
7573 
7574  // migrate part ids to actual owners.
7575  auto host_src_part_ids = Kokkos::create_mirror_view(
7576  Kokkos::HostSpace(), this->assigned_part_ids);
7577  deep_copy(host_src_part_ids, this->assigned_part_ids);
7578  Kokkos::View<mj_part_t*, device_t> dst_part_ids(
7579  Kokkos::ViewAllocateWithoutInitializing("dst_part_ids"), incoming);
7580  auto host_dst_part_ids = Kokkos::create_mirror_view(
7581  Kokkos::HostSpace(), dst_part_ids);
7582  message_tag++;
7583  ierr = Zoltan_Comm_Do( plan, message_tag,
7584  (char *) host_src_part_ids.data(),
7585  sizeof(mj_part_t), (char *) host_dst_part_ids.data());
7586  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7587  Kokkos::deep_copy(dst_part_ids, host_dst_part_ids);
7588  this->assigned_part_ids = dst_part_ids;
7589 
7590  ierr = Zoltan_Comm_Destroy(&plan);
7591  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7592 
7593  this->num_local_coords = incoming;
7594 
7595  this->mj_env->timerStop(MACRO_TIMERS,
7596  mj_timer_base_string + "Final Z1PlanComm");
7597  }
7598  else
7599 #endif // ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
7600  {
7601  // setup incoming count
7602  this->mj_env->timerStart(MACRO_TIMERS,
7603  mj_timer_base_string + "Final DistributorPlanCreating");
7604  Tpetra::Distributor distributor(this->mj_problemComm);
7605  ArrayView<const mj_part_t> owners_of_coords(
7606  this->owner_of_coordinate.data(), this->num_local_coords);
7607  mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
7608  this->mj_env->timerStop(MACRO_TIMERS,
7609  mj_timer_base_string + "Final DistributorPlanCreating" );
7610 
7611  this->mj_env->timerStart(MACRO_TIMERS,
7612  mj_timer_base_string + "Final DistributorPlanComm");
7613 
7614  // migrate gnos to actual owners.
7615  // MPI buffers should be Kokkos::HostSpace, not Kokkos::CudaUVMSpace
7616  // Note, with UVM space, create_mirror_view does NOT create a non-UVM
7617  // view; need the explicit Host creation and deep_copy.
7618  Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
7619  Kokkos::ViewAllocateWithoutInitializing("sent_gnos"),
7620  this->current_mj_gnos.extent(0));
7621  Kokkos::deep_copy(sent_gnos, this->current_mj_gnos);
7622 
7623  Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
7624  Kokkos::ViewAllocateWithoutInitializing("received_gnos"),
7625  incoming);
7626 
7627  distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
7628 
7629  this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
7630  Kokkos::ViewAllocateWithoutInitializing("current_mj_gnos"), incoming);
7631 
7632  Kokkos::deep_copy(this->current_mj_gnos, received_gnos);
7633 
7634  // migrate part ids to actual owners.
7635  Kokkos::View<mj_part_t *, Kokkos::HostSpace> sent_partids(
7636  Kokkos::ViewAllocateWithoutInitializing("sent_partids"),
7637  this->assigned_part_ids.extent(0));
7638  Kokkos::deep_copy(sent_partids, this->assigned_part_ids);
7639 
7640  Kokkos::View<mj_part_t *, Kokkos::HostSpace> received_partids(
7641  Kokkos::ViewAllocateWithoutInitializing("received_partids"),
7642  incoming);
7643 
7644  distributor.doPostsAndWaits(sent_partids, 1, received_partids);
7645 
7646  this->assigned_part_ids =
7647  Kokkos::View<mj_part_t*, device_t>(
7648  Kokkos::ViewAllocateWithoutInitializing("assigned_part_ids"),
7649  incoming);
7650 
7651  Kokkos::deep_copy(this->assigned_part_ids, received_partids);
7652  this->num_local_coords = incoming;
7653 
7654  this->mj_env->timerStop(MACRO_TIMERS,
7655  mj_timer_base_string + "Final DistributorPlanComm");
7656  }
7657  }
7658 
7659  this->mj_env->timerStop(MACRO_TIMERS,
7660  mj_timer_base_string + "Part_Assignment");
7661 
7662  this->mj_env->timerStart(MACRO_TIMERS,
7663  mj_timer_base_string + "Solution_Part_Assignment");
7664 
7665  // ArrayRCP<mj_part_t> partId;
7666  // partId = arcp(this->assigned_part_ids, 0, this->num_local_coords, true);
7667 
7668  if(this->mj_keep_part_boxes) {
7669  this->kept_boxes = compute_global_box_boundaries(output_part_boxes);
7670  }
7671 
7672  this->mj_env->timerStop(MACRO_TIMERS,
7673  mj_timer_base_string + "Solution_Part_Assignment");
7674 }
7675 
7688 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7689  typename mj_part_t, typename mj_node_t>
7692  bool distribute_points_on_cut_lines_,
7693  int max_concurrent_part_calculation_,
7694  int check_migrate_avoid_migration_option_,
7695  double minimum_migration_imbalance_,
7696  int migration_type_)
7697 {
7698  this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
7699  this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
7700  this->check_migrate_avoid_migration_option =
7701  check_migrate_avoid_migration_option_;
7702  this->minimum_migration_imbalance = minimum_migration_imbalance_;
7703  this->migration_type = migration_type_;
7704 }
7705 
7733 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7734  typename mj_part_t, typename mj_node_t>
7737  const RCP<const Environment> &env,
7738  RCP<const Comm<int> > &problemComm,
7739  double imbalance_tolerance_,
7740  int num_teams_,
7741  size_t num_global_parts_,
7742  Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array_,
7743  int recursion_depth_,
7744  int coord_dim_,
7745  mj_lno_t num_local_coords_,
7746  mj_gno_t num_global_coords_,
7747  Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
7748  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
7749  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
7750  int num_weights_per_coord_,
7751  Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights_,
7752  Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
7753  Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts_,
7754  Kokkos::View<mj_part_t *, device_t> & result_assigned_part_ids_,
7755  Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos_)
7756 {
7757 
7758  // see comment above for Zoltan2_AlgMJ_TrackCallsCounter
7760  this->mj_timer_base_string = "MJ(" + std::to_string(execute_counter) + ") - ";
7761 
7762  this->mj_env = env;
7763  this->mj_problemComm = problemComm;
7764  this->myActualRank = this->myRank = this->mj_problemComm->getRank();
7765  this->mj_env->timerStart(MACRO_TIMERS,
7766  mj_timer_base_string + "Total");
7767  this->mj_env->debug(3, "In MultiJagged Jagged");
7768  this->imbalance_tolerance = imbalance_tolerance_;
7769  this->mj_num_teams = num_teams_;
7770  this->num_global_parts = num_global_parts_;
7771  this->part_no_array = part_no_array_;
7772  this->recursion_depth = recursion_depth_;
7773  this->coord_dim = coord_dim_;
7774  this->num_local_coords = num_local_coords_;
7775  this->num_global_coords = num_global_coords_;
7776  this->mj_coordinates = mj_coordinates_;
7777  this->initial_mj_gnos = initial_mj_gnos_;
7778  this->num_weights_per_coord = num_weights_per_coord_;
7779  this->mj_uniform_weights = mj_uniform_weights_;
7780  this->mj_weights = mj_weights_;
7781  this->mj_uniform_parts = mj_uniform_parts_;
7782 
7783  // this->set_input_data();
7784 
7785  this->set_part_specifications();
7786 
7787  this->mj_env->timerStart(MACRO_TIMERS,
7788  mj_timer_base_string + "Allocate Views");
7789  this->allocate_set_work_memory();
7790  this->mj_env->timerStop(MACRO_TIMERS,
7791  mj_timer_base_string + "Allocate Views");
7792 
7793  // We duplicate the comm as we create subcommunicators during migration.
7794  // We keep the problemComm as it is, while comm changes after each migration.
7795  this->comm = this->mj_problemComm->duplicate();
7796 
7797 #ifdef print_debug
7798  if(comm->getRank() == 0) {
7799  std::cout << "size of gno:" << sizeof(mj_gno_t) << std::endl;
7800  std::cout << "size of lno:" << sizeof(mj_lno_t) << std::endl;
7801  std::cout << "size of mj_scalar_t:" << sizeof(mj_scalar_t) << std::endl;
7802  }
7803 #endif
7804 
7805  // initially there is a single partition
7806  mj_part_t current_num_parts = 1;
7807  Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
7808  this->all_cut_coordinates;
7809  this->mj_env->timerStart(MACRO_TIMERS,
7810  mj_timer_base_string + "Problem_Partitioning");
7811  mj_part_t output_part_begin_index = 0;
7812  mj_part_t future_num_parts = this->total_num_part;
7813  bool is_data_ever_migrated = false;
7814 
7815  std::vector<mj_part_t> *future_num_part_in_parts =
7816  new std::vector<mj_part_t> ();
7817  std::vector<mj_part_t> *next_future_num_parts_in_parts =
7818  new std::vector<mj_part_t> ();
7819 
7820  next_future_num_parts_in_parts->push_back(this->num_global_parts);
7821 
7822  RCP<mj_partBoxVector_t> input_part_boxes;
7823  RCP<mj_partBoxVector_t> output_part_boxes;
7824 
7825  if(this->mj_keep_part_boxes) {
7826  input_part_boxes = RCP<mj_partBoxVector_t>(new mj_partBoxVector_t(), true);
7827  output_part_boxes = RCP<mj_partBoxVector_t>(new mj_partBoxVector_t(), true);
7828  compute_global_box();
7829  this->init_part_boxes(output_part_boxes);
7830  }
7831 
7832  auto local_part_xadj = this->part_xadj;
7833 
7834  // Need a device counter - how best to allocate?
7835  // Putting this allocation in the loops is very costly so moved out here.
7836  Kokkos::View<mj_part_t*, device_t>
7837  view_rectilinear_cut_count("view_rectilinear_cut_count", 1);
7838  Kokkos::View<size_t*, device_t>
7839  view_total_reduction_size("view_total_reduction_size", 1);
7840 
7841  for(int i = 0; i < this->recursion_depth; ++i) {
7842 
7843  // convert i to string to be used for debugging purposes.
7844  std::string istring = std::to_string(i);
7845 
7846  // next_future_num_parts_in_parts will be as the size of outnumParts,
7847  // and this will hold how many more parts that each output part
7848  // should be divided. this array will also be used to determine the weight
7849  // ratios of the parts. swap the arrays to use iteratively.
7850  std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
7851  future_num_part_in_parts = next_future_num_parts_in_parts;
7852  next_future_num_parts_in_parts = tmpPartVect;
7853 
7854  // clear next_future_num_parts_in_parts array as
7855  // getPartitionArrays expects it to be empty.
7856  next_future_num_parts_in_parts->clear();
7857  if(this->mj_keep_part_boxes) {
7858  RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7859  input_part_boxes = output_part_boxes;
7860  output_part_boxes = tmpPartBoxes;
7861  output_part_boxes->clear();
7862  }
7863 
7864  // returns the total no. of output parts for this dimension partitioning.
7865  mj_part_t output_part_count_in_dimension =
7866  this->update_part_num_arrays(
7867  future_num_part_in_parts,
7868  next_future_num_parts_in_parts,
7869  future_num_parts,
7870  current_num_parts,
7871  i,
7872  input_part_boxes,
7873  output_part_boxes, 1);
7874 
7875  // if the number of obtained parts equal to current number of parts,
7876  // skip this dimension. For example, this happens when 1 is given in the
7877  // input part array is given. P=4,5,1,2
7878  if(output_part_count_in_dimension == current_num_parts) {
7879  //still need to swap the input output arrays.
7880  tmpPartVect= future_num_part_in_parts;
7881  future_num_part_in_parts = next_future_num_parts_in_parts;
7882  next_future_num_parts_in_parts = tmpPartVect;
7883 
7884  if(this->mj_keep_part_boxes) {
7885  RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7886  input_part_boxes = output_part_boxes;
7887  output_part_boxes = tmpPartBoxes;
7888  }
7889  continue;
7890  }
7891 
7892  // get the coordinate axis along which the partitioning will be done.
7893  int coordInd = i % this->coord_dim;
7894 
7895  Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
7896  Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
7897 
7898  this->mj_env->timerStart(MACRO_TIMERS,
7899  mj_timer_base_string + "Problem_Partitioning_" + istring);
7900 
7901  // alloc Memory to point the indices
7902  // of the parts in the permutation array.
7903  this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
7904  "new part xadj", output_part_count_in_dimension);
7905 
7906  // the index where in the new_part_xadj will be written.
7907  mj_part_t output_part_index = 0;
7908 
7909  // whatever is written to output_part_index will be added with
7910  // output_coordinate_end_index so that the points will be shifted.
7911  mj_part_t output_coordinate_end_index = 0;
7912 
7913  mj_part_t current_work_part = 0;
7914  mj_part_t current_concurrent_num_parts =
7915  std::min(current_num_parts - current_work_part,
7916  this->max_concurrent_part_calculation);
7917 
7918  mj_part_t obtained_part_index = 0;
7919 
7920  auto host_process_local_min_max_coord_total_weight =
7921  Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
7922  auto host_global_min_max_coord_total_weight =
7923  Kokkos::create_mirror_view(global_min_max_coord_total_weight);
7924 
7925  // run for all available parts.
7926  for(; current_work_part < current_num_parts;
7927  current_work_part += current_concurrent_num_parts) {
7928 
7929  current_concurrent_num_parts =
7930  std::min(current_num_parts - current_work_part,
7931  this->max_concurrent_part_calculation);
7932 
7933  int bDoingWork_int; // Can't reduce on bool so use int
7934  auto local_device_num_partitioning_in_current_dim =
7935  device_num_partitioning_in_current_dim;
7936  Kokkos::parallel_reduce("Read bDoingWork",
7937  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
7938  KOKKOS_LAMBDA(int dummy, int & set_single) {
7939  set_single = 0;
7940  for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
7941  if(local_device_num_partitioning_in_current_dim(
7942  current_work_part + kk) != 1) {
7943  set_single = 1;
7944  break;
7945  }
7946  }
7947  }, bDoingWork_int);
7948  bool bDoingWork = (bDoingWork_int != 0) ? true : false;
7949 
7950  this->mj_get_local_min_max_coord_totW(
7951  current_work_part,
7952  current_concurrent_num_parts,
7953  mj_current_dim_coords);
7954 
7955  // 1D partitioning
7956  if(bDoingWork) {
7957  // obtain global Min max of the part.
7958  this->mj_get_global_min_max_coord_totW(
7959  current_concurrent_num_parts,
7960  this->process_local_min_max_coord_total_weight,
7961  this->global_min_max_coord_total_weight);
7962 
7963  // represents the total number of cutlines
7964  // whose coordinate should be determined.
7965  mj_part_t total_incomplete_cut_count = 0;
7966 
7967  // Compute weight ratios for parts & cuts:
7968  // e.g., 0.25 0.25 0.5 0.5 0.75 0.75 1
7969  // part0 cut0 part1 cut1 part2 cut2 part3
7970  mj_part_t concurrent_part_cut_shift = 0;
7971  mj_part_t concurrent_part_part_shift = 0;
7972 
7973  for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
7974 
7975  Kokkos::deep_copy(host_global_min_max_coord_total_weight,
7976  global_min_max_coord_total_weight);
7977 
7978  mj_scalar_t min_coordinate =
7979  host_global_min_max_coord_total_weight(kk);
7980  mj_scalar_t max_coordinate =
7981  host_global_min_max_coord_total_weight(
7982  kk + current_concurrent_num_parts);
7983 
7984  mj_scalar_t global_total_weight =
7985  host_global_min_max_coord_total_weight(
7986  kk + 2 * current_concurrent_num_parts);
7987 
7988  mj_part_t concurrent_current_part_index = current_work_part + kk;
7989 
7990  mj_part_t partition_count = host_num_partitioning_in_current_dim(
7991  concurrent_current_part_index);
7992 
7993  Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
7994  Kokkos::subview(current_cut_coordinates,
7995  std::pair<mj_lno_t, mj_lno_t>(
7996  concurrent_part_cut_shift, current_cut_coordinates.size()));
7997  Kokkos::View<mj_scalar_t *, device_t>
7998  current_target_part_weights =
7999  Kokkos::subview(target_part_weights,
8000  std::pair<mj_lno_t, mj_lno_t>(
8001  concurrent_part_part_shift, target_part_weights.size()));
8002 
8003  // shift the usedCutCoordinate array as noCuts.
8004  concurrent_part_cut_shift += partition_count - 1;
8005  // shift the partRatio array as noParts.
8006  concurrent_part_part_shift += partition_count;
8007 
8008  // calculate only if part is not empty,
8009  // and part will be further partitioned.
8010  if(partition_count > 1 && min_coordinate <= max_coordinate) {
8011 
8012  // increase num_cuts_do_be_determined by the number of cuts of the
8013  // current part's cut line number.
8014  total_incomplete_cut_count += partition_count - 1;
8015 
8016  this->incomplete_cut_count(kk) = partition_count - 1;
8017 
8018  // get the target weights of the parts
8019  this->mj_get_initial_cut_coords_target_weights(
8020  min_coordinate,
8021  max_coordinate,
8022  partition_count - 1,
8023  global_total_weight,
8024  usedCutCoordinate,
8025  current_target_part_weights,
8026  future_num_part_in_parts,
8027  next_future_num_parts_in_parts,
8028  concurrent_current_part_index,
8029  obtained_part_index);
8030 
8031  mj_lno_t coordinate_end_index =
8032  host_part_xadj(concurrent_current_part_index);
8033  mj_lno_t coordinate_begin_index =
8034  concurrent_current_part_index==0 ? 0 :
8035  host_part_xadj(concurrent_current_part_index - 1);
8036 
8037  this->set_initial_coordinate_parts(
8038  max_coordinate,
8039  min_coordinate,
8040  coordinate_begin_index, coordinate_end_index,
8041  this->coordinate_permutations,
8042  mj_current_dim_coords,
8043  this->assigned_part_ids,
8044  partition_count);
8045  }
8046  else {
8047  // e.g., if have fewer coordinates than parts, don't need to do
8048  // next dim.
8049  this->incomplete_cut_count(kk) = 0;
8050  }
8051 
8052  obtained_part_index += partition_count;
8053  }
8054 
8055  // used imbalance, it is always 0, as it is difficult to
8056  // estimate a range.
8057  double used_imbalance = 0;
8058  // Determine cut lines for all concurrent parts here.
8059  this->mj_env->timerStart(MACRO_TIMERS,
8060  mj_timer_base_string + "Problem_Partitioning Get Part Weights");
8061 
8062  this->mj_1D_part(
8063  mj_current_dim_coords,
8064  used_imbalance,
8065  current_work_part,
8066  current_concurrent_num_parts,
8067  current_cut_coordinates,
8068  total_incomplete_cut_count,
8069  view_rectilinear_cut_count,
8070  view_total_reduction_size);
8071 
8072  this->mj_env->timerStop(MACRO_TIMERS,
8073  mj_timer_base_string + "Problem_Partitioning Get Part Weights");
8074  }
8075 
8076  // create new part chunks
8077  {
8078  mj_part_t output_array_shift = 0;
8079  mj_part_t cut_shift = 0;
8080  size_t tlr_shift = 0;
8081  size_t partweight_array_shift = 0;
8082  for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
8083 
8084  mj_part_t current_concurrent_work_part = current_work_part + kk;
8085 
8086  mj_part_t num_parts = host_num_partitioning_in_current_dim(
8087  current_concurrent_work_part);
8088 
8089  // if the part is empty, skip the part.
8090  int coordinateA_bigger_than_coordinateB =
8091  host_global_min_max_coord_total_weight(kk) >
8092  host_global_min_max_coord_total_weight(
8093  kk + current_concurrent_num_parts);
8094 
8095  if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
8096  // we still need to write the begin and end point of the empty part.
8097  // simply set it zero, the array indices will be shifted later
8098  auto local_new_part_xadj = this->new_part_xadj;
8099  Kokkos::parallel_for(
8100  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8101  (0, num_parts), KOKKOS_LAMBDA (mj_part_t jj) {
8102  local_new_part_xadj(
8103  output_part_index + output_array_shift + jj) = 0;
8104  });
8105 
8106  cut_shift += num_parts - 1;
8107  tlr_shift += (4 *(num_parts - 1) + 1);
8108  output_array_shift += num_parts;
8109  partweight_array_shift += (2 * (num_parts - 1) + 1);
8110  continue;
8111  }
8112 
8113  Kokkos::View<mj_scalar_t *, device_t>
8114  current_concurrent_cut_coordinate =
8115  Kokkos::subview(current_cut_coordinates,
8116  std::pair<mj_lno_t, mj_lno_t>(
8117  cut_shift,
8118  current_cut_coordinates.size()));
8119  Kokkos::View<mj_scalar_t *, device_t>
8120  used_local_cut_line_weight_to_left =
8121  Kokkos::subview(process_cut_line_weight_to_put_left,
8122  std::pair<mj_lno_t, mj_lno_t>(
8123  cut_shift,
8124  process_cut_line_weight_to_put_left.size()));
8125 
8126  this->thread_part_weight_work =
8127  Kokkos::subview(
8128  this->thread_part_weights,
8129  std::pair<mj_lno_t, mj_lno_t>(
8130  partweight_array_shift,
8131  this->thread_part_weights.extent(0)));
8132 
8133  if(num_parts > 1) {
8134  if(this->mj_keep_part_boxes) {
8135  // if part boxes are to be stored update the boundaries.
8136  for(mj_part_t j = 0; j < num_parts - 1; ++j) {
8137  mj_scalar_t temp_get_val;
8138  Kokkos::parallel_reduce("Read single",
8139  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8140  KOKKOS_LAMBDA(int dummy, mj_scalar_t & set_single) {
8141  set_single = current_concurrent_cut_coordinate(j);
8142  }, temp_get_val);
8143  (*output_part_boxes)
8144  [output_array_shift + output_part_index + j].
8145  updateMinMax(temp_get_val, 1 /*update max*/, coordInd);
8146  (*output_part_boxes)
8147  [output_array_shift + output_part_index + j + 1].
8148  updateMinMax(temp_get_val, 0 /*update max*/, coordInd);
8149  }
8150  }
8151 
8152  // Rewrite the indices based on the computed cuts.
8153  Kokkos::View<mj_lno_t*, device_t> sub_new_part_xadj =
8154  Kokkos::subview(this->new_part_xadj,
8155  std::pair<mj_lno_t, mj_lno_t>(
8156  output_part_index + output_array_shift,
8157  this->new_part_xadj.size()));
8158 
8159  this->mj_create_new_partitions(
8160  num_parts,
8161  current_concurrent_work_part,
8162  mj_current_dim_coords,
8163  current_concurrent_cut_coordinate,
8164  used_local_cut_line_weight_to_left,
8165  sub_new_part_xadj);
8166  }
8167  else {
8168 
8169  mj_lno_t coordinate_end = host_part_xadj(
8170  current_concurrent_work_part);
8171  mj_lno_t coordinate_begin =
8172  current_concurrent_work_part==0 ? 0 : host_part_xadj(
8173  current_concurrent_work_part - 1);
8174 
8175  // if this part is partitioned into 1 then just copy
8176  // the old values.
8177  mj_lno_t part_size = coordinate_end - coordinate_begin;
8178 
8179  // Awkward here to set one value - need some broader
8180  // refactoring to improve this one.
8181  auto local_new_part_xadj = this->new_part_xadj;
8182  Kokkos::parallel_for(
8183  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
8184  (0, 1), KOKKOS_LAMBDA (int dummy) {
8185  local_new_part_xadj(
8186  output_part_index + output_array_shift) = part_size;
8187  });
8188 
8189  auto subview_new_coordinate_permutations =
8190  Kokkos::subview(this->new_coordinate_permutations,
8191  std::pair<mj_lno_t, mj_lno_t>(
8192  coordinate_begin,
8193  coordinate_begin + part_size));
8194  auto subview_coordinate_permutations =
8195  Kokkos::subview(this->coordinate_permutations,
8196  std::pair<mj_lno_t, mj_lno_t>(
8197  coordinate_begin,
8198  coordinate_begin + part_size));
8199  Kokkos::deep_copy(subview_new_coordinate_permutations,
8200  subview_coordinate_permutations);
8201  }
8202  cut_shift += num_parts - 1;
8203  output_array_shift += num_parts;
8204  partweight_array_shift += (2 * (num_parts - 1) + 1);
8205  }
8206 
8207  // shift cut coordinates so that all cut coordinates are stored.
8208  // no shift now because we dont keep the cuts.
8209  // current_cut_coordinates += cut_shift;
8210  // mj_create_new_partitions from coordinates partitioned the parts
8211  // and write the indices as if there were a single part.
8212  // now we need to shift the beginning indices.
8213  for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
8214  mj_part_t num_parts =
8215  host_num_partitioning_in_current_dim(current_work_part + kk);
8216 
8217  // These two kernels are a bit awkward but need broader redesign to
8218  // avoid this situation.
8219  auto local_new_part_xadj = this->new_part_xadj;
8220  Kokkos::parallel_for(
8221  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8222  (0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
8223  local_new_part_xadj(output_part_index+ii) +=
8224  output_coordinate_end_index;
8225  });
8226 
8227  // increase the previous count by current end.
8228  mj_part_t temp_get;
8229  Kokkos::parallel_reduce("Read single",
8230  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8231  KOKKOS_LAMBDA(int dummy, mj_part_t & set_single) {
8232  set_single =
8233  local_new_part_xadj(output_part_index + num_parts - 1);
8234  }, temp_get);
8235  output_coordinate_end_index = temp_get;
8236  //increase the current out.
8237  output_part_index += num_parts;
8238  }
8239  }
8240  }
8241 
8242  // end of this partitioning dimension
8243  int current_world_size = this->comm->getSize();
8244  long migration_reduce_all_population =
8245  this->total_dim_num_reduce_all * current_world_size;
8246  bool is_migrated_in_current_dimension = false;
8247 
8248  // we migrate if there are more partitionings to be done after this step
8249  // and if the migration is not forced to be avoided.
8250  // and the operation is not sequential.
8251  if(future_num_parts > 1 &&
8252  this->check_migrate_avoid_migration_option >= 0 &&
8253  current_world_size > 1) {
8254  this->mj_env->timerStart(MACRO_TIMERS,
8255  mj_timer_base_string + "Problem_Migration-" + istring);
8256  mj_part_t num_parts = output_part_count_in_dimension;
8257 
8258  if(this->mj_perform_migration(
8259  num_parts,
8260  current_num_parts, //output
8261  next_future_num_parts_in_parts, //output
8262  output_part_begin_index,
8263  migration_reduce_all_population,
8264  this->num_global_coords / (future_num_parts * current_num_parts),
8265  istring,
8266  input_part_boxes, output_part_boxes) )
8267  {
8268  is_migrated_in_current_dimension = true;
8269  is_data_ever_migrated = true;
8270  this->mj_env->timerStop(MACRO_TIMERS,
8271  mj_timer_base_string + "Problem_Migration-" + istring);
8272  // since data is migrated, we reduce the number of reduceAll
8273  // operations for the last part.
8274  this->total_dim_num_reduce_all /= num_parts;
8275  }
8276  else {
8277  is_migrated_in_current_dimension = false;
8278  this->mj_env->timerStop(MACRO_TIMERS,
8279  mj_timer_base_string + "Problem_Migration-" + istring);
8280  }
8281  }
8282 
8283  // swap the coordinate permutations for the next dimension.
8284  Kokkos::View<mj_lno_t*, device_t> tmp =
8285  this->coordinate_permutations;
8286  this->coordinate_permutations =
8287  this->new_coordinate_permutations;
8288 
8289  this->new_coordinate_permutations = tmp;
8290  if(!is_migrated_in_current_dimension) {
8291  this->total_dim_num_reduce_all -= current_num_parts;
8292  current_num_parts = output_part_count_in_dimension;
8293  }
8294 
8295  {
8296  this->part_xadj = this->new_part_xadj;
8297  local_part_xadj = this->new_part_xadj;
8298  this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
8299  Kokkos::deep_copy(host_part_xadj, part_xadj); // keep in sync
8300 
8301  this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>("empty", 0);
8302  this->mj_env->timerStop(MACRO_TIMERS,
8303  mj_timer_base_string + "Problem_Partitioning_" + istring);
8304  }
8305  }
8306 
8307  // Partitioning is done
8308  delete future_num_part_in_parts;
8309  delete next_future_num_parts_in_parts;
8310  this->mj_env->timerStop(MACRO_TIMERS,
8311  mj_timer_base_string + "Problem_Partitioning");
8313 
8314  //get the final parts of each initial coordinate
8315  //the results will be written to
8316  //this->assigned_part_ids for gnos given in this->current_mj_gnos
8317  this->set_final_parts(
8318  current_num_parts,
8319  output_part_begin_index,
8320  output_part_boxes,
8321  is_data_ever_migrated);
8322 
8323  result_assigned_part_ids_ = this->assigned_part_ids;
8324  result_mj_gnos_ = this->current_mj_gnos;
8325  this->mj_env->timerStop(MACRO_TIMERS,
8326  mj_timer_base_string + "Total");
8327  this->mj_env->debug(3, "Out of MultiJagged");
8328 }
8329 
8330 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
8331  typename mj_part_t, typename mj_node_t>
8332 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8333  mj_partBoxVector_t>
8336 {
8337  if(this->mj_keep_part_boxes) {
8338  return this->kept_boxes;
8339  }
8340  else {
8341  throw std::logic_error("Error: part boxes are not stored.");
8342  }
8343 }
8344 
8345 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
8346  typename mj_part_t, typename mj_node_t>
8347 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8348  mj_partBoxVector_t>
8350  compute_global_box_boundaries(RCP<mj_partBoxVector_t> &localPartBoxes) const
8351 {
8352  typedef typename Zoltan2::coordinateModelPartBox::coord_t coord_t;
8353  mj_part_t ntasks = this->num_global_parts;
8354  int dim = (*localPartBoxes)[0].getDim();
8355  coord_t *localPartBoundaries = new coord_t[ntasks * 2 *dim];
8356 
8357  memset(localPartBoundaries, 0, sizeof(coord_t) * ntasks * 2 *dim);
8358 
8359  coord_t *globalPartBoundaries = new coord_t[ntasks * 2 *dim];
8360  memset(globalPartBoundaries, 0, sizeof(coord_t) * ntasks * 2 *dim);
8361 
8362  coord_t *localPartMins = localPartBoundaries;
8363  coord_t *localPartMaxs = localPartBoundaries + ntasks * dim;
8364 
8365  coord_t *globalPartMins = globalPartBoundaries;
8366  coord_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
8367 
8368  mj_part_t boxCount = localPartBoxes->size();
8369  for(mj_part_t i = 0; i < boxCount; ++i) {
8370  mj_part_t pId = (*localPartBoxes)[i].getpId();
8371 
8372  // cout << "me:" << comm->getRank() << " has:" << pId << endl;
8373 
8374  coord_t *lmins = (*localPartBoxes)[i].getlmins();
8375  coord_t *lmaxs = (*localPartBoxes)[i].getlmaxs();
8376 
8377  for(int j = 0; j < dim; ++j) {
8378  localPartMins[dim * pId + j] = lmins[j];
8379  localPartMaxs[dim * pId + j] = lmaxs[j];
8380 
8381  /*
8382  std::cout << "me:" << comm->getRank() <<
8383  " dim * pId + j:"<< dim * pId + j <<
8384  " localMin:" << localPartMins[dim * pId + j] <<
8385  " localMax:" << localPartMaxs[dim * pId + j] << std::endl;
8386  */
8387  }
8388  }
8389 
8390  Teuchos::Zoltan2_BoxBoundaries<int, coord_t> reductionOp(ntasks * 2 *dim);
8391 
8392  reduceAll<int, coord_t>(*mj_problemComm, reductionOp,
8393  ntasks * 2 *dim, localPartBoundaries, globalPartBoundaries);
8394 
8395  RCP<mj_partBoxVector_t> pB(new mj_partBoxVector_t(),true);
8396  for(mj_part_t i = 0; i < ntasks; ++i) {
8398  globalPartMins + dim * i,
8399  globalPartMaxs + dim * i);
8400 
8401  /*
8402  for(int j = 0; j < dim; ++j) {
8403  std::cout << "me:" << comm->getRank() <<
8404  " dim * pId + j:"<< dim * i + j <<
8405  " globalMin:" << globalPartMins[dim * i + j] <<
8406  " globalMax:" << globalPartMaxs[dim * i + j] << std::endl;
8407  }
8408  */
8409 
8410  pB->push_back(tpb);
8411  }
8412  delete []localPartBoundaries;
8413  delete []globalPartBoundaries;
8414  //RCP <mj_partBoxVector_t> tmpRCPBox(pB, true);
8415  return pB;
8416 }
8417 
8420 template <typename Adapter>
8421 class Zoltan2_AlgMJ : public Algorithm<Adapter>
8422 {
8423 
8424 private:
8425 
8426 #ifndef DOXYGEN_SHOULD_SKIP_THIS
8427  // For coordinates and weights, MJ needs floats or doubles
8428  // But Adapter can provide other scalars, e.g., ints.
8429  // So have separate scalar_t for MJ and adapter.
8430  typedef typename Adapter::scalar_t adapter_scalar_t;
8431 
8432  // Provide a default type for mj_scalar_t;
8433  typedef float default_mj_scalar_t;
8434 
8435  // If Adapter provided float or double scalar_t, use it (prevents copies).
8436  // Otherwise, use the default type of mj_scalar_t;
8437  typedef typename
8438  std::conditional<
8439  (std::is_same<adapter_scalar_t, float>::value ||
8440  std::is_same<adapter_scalar_t, double>::value),
8441  adapter_scalar_t, default_mj_scalar_t>::type mj_scalar_t;
8442 
8443  typedef typename Adapter::gno_t mj_gno_t;
8444  typedef typename Adapter::lno_t mj_lno_t;
8445  typedef typename Adapter::part_t mj_part_t;
8446  typedef typename Adapter::node_t mj_node_t;
8447  typedef coordinateModelPartBox mj_partBox_t;
8448  typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
8449  typedef typename mj_node_t::device_type device_t;
8450 #endif
8451 
8453 
8454  RCP<const Environment> mj_env; // the environment object
8455  RCP<const Comm<int> > mj_problemComm; // initial comm object
8456  RCP<const typename Adapter::base_adapter_t> mj_adapter; // coordinate adapter
8457 
8458  // PARAMETERS
8459  double imbalance_tolerance; // input imbalance tolerance.
8460 
8461  int num_teams; // how many teams to run main loop with
8462 
8463  size_t num_global_parts; // the targeted number of parts
8464 
8465  // input part array specifying num part to divide along each dim.
8466  Kokkos::View<mj_part_t*, Kokkos::HostSpace> part_no_array;
8467 
8468  // the number of steps that partitioning will be solved in.
8469  int recursion_depth;
8470 
8471  int coord_dim; // coordinate dimension.
8472  mj_lno_t num_local_coords; //number of local coords.
8473  mj_gno_t num_global_coords; //number of global coords.
8474 
8475  // initial global ids of the coordinates.
8476  Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
8477 
8478  // two dimension coordinate array.
8479  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8480  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8481  mj_coordinates;
8482 
8483  int num_weights_per_coord; // number of weights per coordinate
8484 
8485  // if the target parts are uniform.
8486  Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_weights;
8487 
8488  // two dimensional weight array.
8489  Kokkos::View<mj_scalar_t**, device_t> mj_weights;
8490 
8491  // if the target parts are uniform
8492  Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_parts;
8493 
8494  // Nonuniform first level partitioning
8495  // Currently used for Dragonfly task mapping by partitioning Dragonfly RCA
8496  // machine coordinates and application coordinates.
8497  // An optimization that completely partitions the most important machine
8498  // dimension first (i.e. the Dragonfly group coordinate, or RCA's x
8499  // coordinate). The standard MJ alg follows after the nonuniform first level
8500  // partitioning.
8501  // If used, number of parts for the first level partitioning
8502  mj_part_t num_first_level_parts;
8503 
8504  // If used, the distribution of parts for the nonuniform
8505  // first level partitioning
8506  Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;
8507 
8508  // if partitioning can distribute points on same coordiante to
8509  // different parts.
8510  bool distribute_points_on_cut_lines;
8511 
8512  // how many parts we can calculate concurrently.
8513  mj_part_t max_concurrent_part_calculation;
8514 
8515  // whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
8516  int check_migrate_avoid_migration_option;
8517 
8518  // when doing the migration, 0 will aim for perfect load-imbalance,
8519  int migration_type;
8520 
8521  // 1 for minimized messages
8522 
8523  // when MJ decides whether to migrate, the minimum imbalance for migration.
8524  double minimum_migration_imbalance;
8525  bool mj_keep_part_boxes; //if the boxes need to be kept.
8526 
8527  // if this is set, then recursion depth is adjusted to its maximum value.
8528  bool mj_run_as_rcb;
8529  int mj_premigration_option;
8530  int min_coord_per_rank_for_premigration;
8531 
8532  // communication graph xadj
8533  ArrayRCP<mj_part_t> comXAdj_;
8534 
8535  // communication graph adj.
8536  ArrayRCP<mj_part_t> comAdj_;
8537 
8538  void copy(
8539  const RCP<PartitioningSolution<Adapter> >&solution);
8540 
8541  void set_input_parameters(const Teuchos::ParameterList &p);
8542 
8543  RCP<mj_partBoxVector_t> getGlobalBoxBoundaries() const;
8544 
8545  bool mj_premigrate_to_subset(
8546  int used_num_ranks,
8547  int migration_selection_option,
8548  RCP<const Environment> mj_env_,
8549  RCP<const Comm<int> > mj_problemComm_,
8550  int coord_dim_,
8551  mj_lno_t num_local_coords_,
8552  mj_gno_t num_global_coords_, size_t num_global_parts_,
8553  Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8554  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8555  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8556  mj_coordinates_,
8557  int num_weights_per_coord_,
8558  Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8559  //results
8560  RCP<const Comm<int> > &result_problemComm_,
8561  mj_lno_t & result_num_local_coords_,
8562  Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8563  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8564  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8565  result_mj_coordinates_,
8566  Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8567  int * &result_actual_owner_rank_);
8568 
8569 public:
8570 
8571  Zoltan2_AlgMJ(const RCP<const Environment> &env,
8572  RCP<const Comm<int> > &problemComm,
8573  const RCP<const typename Adapter::base_adapter_t> &adapter) :
8574  mj_partitioner(),
8575  mj_env(env),
8576  mj_problemComm(problemComm),
8577  mj_adapter(adapter),
8578  imbalance_tolerance(0),
8579  num_teams(0),
8580  num_global_parts(1),
8581  recursion_depth(0),
8582  coord_dim(0),
8583  num_local_coords(0),
8584  num_global_coords(0),
8585  num_weights_per_coord(0),
8586  num_first_level_parts(1),
8587  distribute_points_on_cut_lines(true),
8588  max_concurrent_part_calculation(1),
8589  check_migrate_avoid_migration_option(0),
8590  migration_type(0),
8591  minimum_migration_imbalance(0.30),
8592  mj_keep_part_boxes(false),
8593  mj_run_as_rcb(false),
8594  mj_premigration_option(0),
8595  min_coord_per_rank_for_premigration(32000),
8596  comXAdj_(),
8597  comAdj_()
8598  {
8599  }
8600 
8602  {
8603  }
8604 
8607  static void getValidParameters(ParameterList & pl)
8608  {
8609  const bool bUnsorted = true; // this clarifies the flag is for unsrorted
8610  RCP<Zoltan2::IntegerRangeListValidator<int>> mj_parts_Validator =
8611  Teuchos::rcp( new Zoltan2::IntegerRangeListValidator<int>(bUnsorted) );
8612  pl.set("mj_parts", "0", "list of parts for multiJagged partitioning "
8613  "algorithm. As many as the dimension count.", mj_parts_Validator);
8614 
8615  pl.set("mj_concurrent_part_count", 1, "The number of parts whose cut "
8616  "coordinates will be calculated concurently.",
8618 
8619  pl.set("mj_minimum_migration_imbalance", 1.1,
8620  "mj_minimum_migration_imbalance, the minimum imbalance of the "
8621  "processors to avoid migration",
8623 
8624  RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_option_validator =
8625  Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 2) );
8626  pl.set("mj_migration_option", 1, "Migration option, 0 for decision "
8627  "depending on the imbalance, 1 for forcing migration, 2 for "
8628  "avoiding migration", mj_migration_option_validator);
8629 
8630  RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_type_validator =
8631  Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 1) );
8632  pl.set("mj_migration_type", 0,
8633  "Migration type, 0 for migration to minimize the imbalance "
8634  "1 for migration to minimize messages exchanged the migration.",
8635  mj_migration_option_validator);
8636 
8637  // bool parameter
8638  pl.set("mj_keep_part_boxes", false, "Keep the part boundaries of the "
8639  "geometric partitioning.", Environment::getBoolValidator());
8640 
8641  // bool parameter
8642  pl.set("mj_enable_rcb", false, "Use MJ as RCB.",
8644 
8645  pl.set("mj_recursion_depth", -1, "Recursion depth for MJ: Must be "
8646  "greater than 0.", Environment::getAnyIntValidator());
8647 
8648  RCP<Teuchos::EnhancedNumberValidator<int>>
8649  mj_num_teams_validator =
8650  Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(
8651  0, Teuchos::EnhancedNumberTraits<int>::max()) );
8652  pl.set("mj_num_teams", 0,
8653  "How many teams for the main kernel loop"
8654  , mj_num_teams_validator);
8655 
8656  RCP<Teuchos::EnhancedNumberValidator<int>>
8657  mj_premigration_option_validator =
8658  Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 1024) );
8659 
8660  pl.set("mj_premigration_option", 0,
8661  "Whether to do premigration or not. 0 for no migration "
8662  "x > 0 for migration to consecutive processors, "
8663  "the subset will be 0,x,2x,3x,...subset ranks."
8664  , mj_premigration_option_validator);
8665 
8666  pl.set("mj_premigration_coordinate_count", 32000, "How many coordinate to "
8667  "assign each rank in multijagged after premigration"
8669  }
8670 
8676  void partition(const RCP<PartitioningSolution<Adapter> > &solution);
8677 
8678  mj_partBoxVector_t &getPartBoxesView() const
8679  {
8680  RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
8681  return *pBoxes;
8682  }
8683 
8684  mj_part_t pointAssign(int dim, adapter_scalar_t *point) const;
8685 
8686  void boxAssign(int dim, adapter_scalar_t *lower, adapter_scalar_t *upper,
8687  size_t &nPartsFound, mj_part_t **partsFound) const;
8688 
8691  void getCommunicationGraph(
8692  const PartitioningSolution<Adapter> *solution,
8693  ArrayRCP<mj_part_t> &comXAdj,
8694  ArrayRCP<mj_part_t> &comAdj);
8695 
8696  void set_up_partitioning_data( // public for CUDA
8697  const RCP<PartitioningSolution<Adapter> >&solution);
8698 
8699  private:
8700  std::string timer_base_string; // used for making timers
8701 
8702  // After loading views from coordinate adapter we may need to copy them
8703  // if mj type is different, but otherwise we just want to assign the view.
8704  // So purpose of this code is to make that assign only happen when the types
8705  // match. The empty case would otherwise not compile.
8706  // If they don't match the internal code handles allocating the new view
8707  // and copying the elements. See the test Zoltan2_mj_int_coordinates.
8708  template<class dst_t, class src_t> // version for same types
8709  typename std::enable_if<std::is_same<typename dst_t::value_type,
8710  typename src_t::value_type>::value>::type
8711  assign_if_same(dst_t & dst, const src_t & src) {
8712  dst = src;
8713  }
8714  template<class dst_t, class src_t> // version for different types
8715  typename std::enable_if<!std::is_same<typename dst_t::value_type,
8716  typename src_t::value_type>::value>::type
8717  assign_if_same(dst_t & dst, const src_t & src) {
8718  // do nothing - handled manually
8719  }
8720 };
8721 
8722 template <typename Adapter>
8723 bool Zoltan2_AlgMJ<Adapter>::mj_premigrate_to_subset(
8724  int used_num_ranks,
8725  int migration_selection_option,
8726  RCP<const Environment> mj_env_,
8727  RCP<const Comm<int> > mj_problemComm_,
8728  int coord_dim_,
8729  mj_lno_t num_local_coords_,
8730  mj_gno_t num_global_coords_, size_t num_global_parts_,
8731  Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8732  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8733  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
8734  int num_weights_per_coord_,
8735  Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8736  //results
8737  RCP<const Comm<int> > & result_problemComm_,
8738  mj_lno_t &result_num_local_coords_,
8739  Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8740  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8741  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8742  result_mj_coordinates_,
8743  Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8744  int * &result_actual_owner_rank_)
8745 {
8746  mj_env_->timerStart(MACRO_TIMERS,
8747  timer_base_string + "PreMigration DistributorPlanCreating");
8748 
8749  int myRank = mj_problemComm_->getRank();
8750  int worldSize = mj_problemComm_->getSize();
8751 
8752  mj_part_t groupsize = worldSize / used_num_ranks;
8753 
8754  std::vector<mj_part_t> group_begins(used_num_ranks + 1, 0);
8755 
8756  mj_part_t i_am_sending_to = 0;
8757  bool am_i_a_receiver = false;
8758 
8759  for(int i = 0; i < used_num_ranks; ++i) {
8760  group_begins[i+ 1] = group_begins[i] + groupsize;
8761  if(worldSize % used_num_ranks > i) group_begins[i+ 1] += 1;
8762  if(i == used_num_ranks) group_begins[i+ 1] = worldSize;
8763  if(myRank >= group_begins[i] && myRank < group_begins[i + 1]) {
8764  i_am_sending_to = group_begins[i];
8765  }
8766  if(myRank == group_begins[i]) {
8767  am_i_a_receiver = true;
8768  }
8769  }
8770 
8771  ArrayView<const mj_part_t> idView(&(group_begins[0]), used_num_ranks );
8772  result_problemComm_ = mj_problemComm_->createSubcommunicator(idView);
8773 
8774  Tpetra::Distributor distributor(mj_problemComm_);
8775 
8776  std::vector<mj_part_t>
8777  coordinate_destinations(num_local_coords_, i_am_sending_to);
8778 
8779  ArrayView<const mj_part_t>
8780  destinations(&(coordinate_destinations[0]), num_local_coords_);
8781  mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
8782  result_num_local_coords_ = num_incoming_gnos;
8783  mj_env_->timerStop(MACRO_TIMERS,
8784  timer_base_string + "PreMigration DistributorPlanCreating");
8785 
8786  mj_env_->timerStart(MACRO_TIMERS,
8787  timer_base_string + "PreMigration DistributorMigration");
8788 
8789 
8790  // migrate gnos.
8791  // MPI buffers should be on Kokkos::HostSpace not Kokkos::CudaUVMSpace
8792  // Note, with UVM space, create_mirror_view does NOT create a non-UVM
8793  // view; need the explicit Host creation and deep_copy.
8794  {
8795  Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
8796  Kokkos::ViewAllocateWithoutInitializing("sent_gnos"),
8797  initial_mj_gnos_.size()); // initial_mj_gnos_ is const mj_gno_t *
8798  Kokkos::deep_copy(sent_gnos, initial_mj_gnos_);
8799 
8800  Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos (
8801  Kokkos::ViewAllocateWithoutInitializing("received_gnos"),
8802  num_incoming_gnos);
8803 
8804  distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
8805 
8806  result_initial_mj_gnos_ = Kokkos::View<mj_gno_t*, device_t>(
8807  Kokkos::ViewAllocateWithoutInitializing("result_initial_mj_gnos_"),
8808  num_incoming_gnos);
8809  Kokkos::deep_copy(result_initial_mj_gnos_, received_gnos);
8810  }
8811 
8812  // migrate coordinates
8813  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8814 
8815  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>
8816  host_src_coordinates(
8817  Kokkos::ViewAllocateWithoutInitializing("mj_coordinates"),
8818  this->mj_coordinates.extent(0), this->mj_coordinates.extent(1));
8819 
8820  Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
8821 
8822  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> dst_coordinates(
8823  Kokkos::ViewAllocateWithoutInitializing("mj_coordinates"),
8824  num_incoming_gnos, this->coord_dim);
8825 
8826  Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_coord(
8827  Kokkos::ViewAllocateWithoutInitializing("received_coord"),
8828  num_incoming_gnos);
8829 
8830  for(int i = 0; i < this->coord_dim; ++i) {
8831 
8832  auto sent_coord = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
8833 
8834  distributor.doPostsAndWaits(sent_coord, 1, received_coord);
8835 
8836  Kokkos::deep_copy(Kokkos::subview(dst_coordinates, Kokkos::ALL, i),
8837  received_coord);
8838  Kokkos::fence();
8839  }
8840  result_mj_coordinates_ = dst_coordinates;
8841 
8842  // migrate weights.
8843 
8844  Kokkos::View<mj_scalar_t**, device_t> dst_weights(
8845  Kokkos::ViewAllocateWithoutInitializing("mj_weights"),
8846  num_incoming_gnos, this->num_weights_per_coord);
8847  auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
8848 
8849  auto host_src_weights = Kokkos::create_mirror_view_and_copy(
8850  Kokkos::HostSpace(), this->mj_weights);
8851 
8852  // contiguous buffers to gather potentially strided data
8853  Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_weight(
8854  Kokkos::ViewAllocateWithoutInitializing("send_weight_buffer"),
8855  this->num_local_coords);
8856 
8857  Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_weight(
8858  Kokkos::ViewAllocateWithoutInitializing("received_weight_buffer"),
8859  num_incoming_gnos);
8860 
8861  for(int i = 0; i < this->num_weights_per_coord; ++i) {
8862 
8863  auto sub_host_src_weights
8864  = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
8865  auto sub_host_dst_weights
8866  = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
8867 
8868  // Layout Right means these weights are not contiguous
8869  // However we don't have any systems setup with more than 1 weight so
8870  // really I have not tested any of this code with num weights > 1.
8871  // I think this is the right thing to do. Note that there are other
8872  // places in the code which don't handle the possibility of more weights.
8873  // So evaluating all that and adding tests would be another project.
8874  for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
8875  sent_weight[n] = sub_host_src_weights(n);
8876  }
8877 
8878  distributor.doPostsAndWaits(sent_weight, 1, received_weight);
8879 
8880  // Again we copy by index due to layout
8881  for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
8882  sub_host_dst_weights(n) = received_weight[n];
8883  }
8884  }
8885  Kokkos::deep_copy(dst_weights, host_dst_weights);
8886  result_mj_weights_ = dst_weights;
8887 
8888  // migrate the owners of the coordinates
8889  {
8890  Kokkos::View<int*, Kokkos::HostSpace> sent_owners(
8891  Kokkos::ViewAllocateWithoutInitializing("sent_owners"),
8892  num_local_coords_);
8893  Kokkos::deep_copy(sent_owners, myRank);
8894 
8895  Kokkos::View<int*, Kokkos::HostSpace> received_owners(
8896  Kokkos::ViewAllocateWithoutInitializing("received_owners"),
8897  num_incoming_gnos);
8898 
8899  distributor.doPostsAndWaits(sent_owners, 1, received_owners);
8900 
8901  result_actual_owner_rank_ = new int[num_incoming_gnos];
8902  memcpy(
8903  result_actual_owner_rank_,
8904  received_owners.data(),
8905  num_incoming_gnos * sizeof(int));
8906  }
8907 
8908  mj_env_->timerStop(MACRO_TIMERS,
8909  timer_base_string + "PreMigration DistributorMigration");
8910  return am_i_a_receiver;
8911 }
8912 
8920 template <typename Adapter>
8922  const RCP<PartitioningSolution<Adapter> > &solution)
8923 {
8924  // purpose of this code is to validate node and UVM status for the tests
8925  // std::cout << "Memory Space: " << mj_node_t::memory_space::name() << " "
8926  // << "Execution Space: " << mj_node_t::execution_space::name()
8927  // << std::endl;
8928 
8929  int execute_counter =
8931  timer_base_string = "partition(" + std::to_string(execute_counter) + ") - ";
8932 
8933  this->mj_env->timerStart(MACRO_TIMERS, timer_base_string + "all");
8934  {
8935  this->mj_env->timerStart(MACRO_TIMERS, timer_base_string + "setup");
8936 
8937  this->set_up_partitioning_data(solution);
8938 
8939  this->set_input_parameters(this->mj_env->getParameters());
8940  if(this->mj_keep_part_boxes) {
8941  this->mj_partitioner.set_to_keep_part_boxes();
8942  }
8943 
8944  this->mj_partitioner.set_partitioning_parameters(
8945  this->distribute_points_on_cut_lines,
8946  this->max_concurrent_part_calculation,
8947  this->check_migrate_avoid_migration_option,
8948  this->minimum_migration_imbalance, this->migration_type);
8949 
8950  RCP<const Comm<int> > result_problemComm = this->mj_problemComm;
8951  mj_lno_t result_num_local_coords = this->num_local_coords;
8952  Kokkos::View<mj_gno_t*, device_t> result_initial_mj_gnos;
8953  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8954  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8955  result_mj_coordinates = this->mj_coordinates;
8956  Kokkos::View<mj_scalar_t**, device_t> result_mj_weights =
8957  this->mj_weights;
8958  int *result_actual_owner_rank = NULL;
8959 
8960  Kokkos::View<const mj_gno_t*, device_t> result_initial_mj_gnos_ =
8961  this->initial_mj_gnos;
8962 
8963  // TODO: MD 08/2017: Further discussion is required.
8964  // MueLu calls MJ when it has very few coordinates per processors,
8965  // such as 10. For example, it begins with 1K processor with 1K coordinate
8966  // in each. Then with coarsening this reduces to 10 coordinates per processor.
8967  // It calls MJ to repartition these to 10 coordinates.
8968  // MJ runs with 1K processor, 10 coordinate in each, and partitions to
8969  // 10 parts. As expected strong scaling is problem here, because
8970  // computation is almost 0, and communication cost of MJ linearly increases.
8971  // Premigration option gathers the coordinates to 10 parts before MJ starts
8972  // therefore MJ will run with a smaller subset of the problem.
8973  // Below, I am migrating the coordinates if mj_premigration_option is set,
8974  // and the result parts are less than the current part count, and the
8975  // average number of local coordinates is less than some threshold.
8976  // For example, premigration may not help if 1000 processors are
8977  // partitioning data to 10, but each of them already have 1M coordinate.
8978  // In that case, premigration would not help.
8979  int current_world_size = this->mj_problemComm->getSize();
8980  mj_lno_t threshold_num_local_coords =
8981  this->min_coord_per_rank_for_premigration;
8982  bool is_pre_migrated = false;
8983  bool am_i_in_subset = true;
8984 
8985  // Note that we need to add testing for migration and should also cover the
8986  // zoltan case when ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION is defined.
8987  // Currently did a minimal test of this code by running mjTest with
8988  // PM=1, TB=0 then run again with C=3 instead of C=4 (numProcs is 4).
8989  if(mj_premigration_option > 0 &&
8990  size_t (current_world_size) > this->num_global_parts &&
8991  this->num_global_coords < mj_gno_t (
8992  current_world_size * threshold_num_local_coords))
8993  {
8994  if(this->mj_keep_part_boxes) {
8995  throw std::logic_error("Multijagged: mj_keep_part_boxes and "
8996  "mj_premigration_option are not supported together yet.");
8997  }
8998 
8999  is_pre_migrated =true;
9000  int migration_selection_option = mj_premigration_option;
9001  if(migration_selection_option * this->num_global_parts >
9002  (size_t) (current_world_size)) {
9003  migration_selection_option =
9004  current_world_size / this->num_global_parts;
9005  }
9006 
9007  int used_num_ranks = int (this->num_global_coords /
9008  float (threshold_num_local_coords) + 0.5);
9009 
9010  if(used_num_ranks == 0) {
9011  used_num_ranks = 1;
9012  }
9013 
9014  am_i_in_subset = this->mj_premigrate_to_subset(
9015  used_num_ranks,
9016  migration_selection_option,
9017  this->mj_env,
9018  this->mj_problemComm,
9019  this->coord_dim,
9020  this->num_local_coords,
9021  this->num_global_coords,
9022  this->num_global_parts,
9023  this->initial_mj_gnos,
9024  this->mj_coordinates,
9025  this->num_weights_per_coord,
9026  this->mj_weights,
9027  //results
9028  result_problemComm,
9029  result_num_local_coords,
9030  result_initial_mj_gnos,
9031  result_mj_coordinates,
9032  result_mj_weights,
9033  result_actual_owner_rank);
9034 
9035  result_initial_mj_gnos_ = result_initial_mj_gnos;
9036  }
9037 
9038  Kokkos::View<mj_part_t *, device_t> result_assigned_part_ids;
9039  Kokkos::View<mj_gno_t*, device_t> result_mj_gnos;
9040 
9041  this->mj_env->timerStop(MACRO_TIMERS, timer_base_string + "setup");
9042 
9043  if(am_i_in_subset) {
9044  this->mj_partitioner.multi_jagged_part(
9045  this->mj_env,
9046  result_problemComm, //this->mj_problemComm,
9047  this->imbalance_tolerance,
9048  this->num_teams,
9049  this->num_global_parts,
9050  this->part_no_array,
9051  this->recursion_depth,
9052  this->coord_dim,
9053  result_num_local_coords, //this->num_local_coords,
9054  this->num_global_coords,
9055  result_initial_mj_gnos_,
9056  result_mj_coordinates,
9057  this->num_weights_per_coord,
9058  this->mj_uniform_weights,
9059  result_mj_weights,
9060  this->mj_uniform_parts,
9061  result_assigned_part_ids,
9062  result_mj_gnos
9063  );
9064  }
9065 
9066  this->mj_env->timerStart(MACRO_TIMERS, timer_base_string + "cleanup");
9067 
9068  // Reorder results so that they match the order of the input
9069  std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid;
9070  localGidToLid.reserve(result_num_local_coords);
9071  Kokkos::View<mj_gno_t*, Kokkos::HostSpace> host_result_initial_mj_gnos(
9072  Kokkos::ViewAllocateWithoutInitializing("host_result_initial_mj_gnos"),
9073  result_initial_mj_gnos_.size());
9074  Kokkos::deep_copy(host_result_initial_mj_gnos, result_initial_mj_gnos_);
9075  for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9076  localGidToLid[host_result_initial_mj_gnos(i)] = i;
9077  }
9078 
9079  ArrayRCP<mj_part_t> partId = arcp(new mj_part_t[result_num_local_coords],
9080  0, result_num_local_coords, true);
9081  auto host_result_assigned_part_ids =
9082  Kokkos::create_mirror_view(result_assigned_part_ids);
9083  Kokkos::deep_copy(host_result_assigned_part_ids, result_assigned_part_ids);
9084  auto host_result_mj_gnos = Kokkos::create_mirror_view(result_mj_gnos);
9085  Kokkos::deep_copy(host_result_mj_gnos, result_mj_gnos);
9086  for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9087  mj_lno_t origLID = localGidToLid[host_result_mj_gnos(i)];
9088  partId[origLID] = host_result_assigned_part_ids(i);
9089  }
9090 
9091  //now the results are reordered. but if premigration occured,
9092  //then we need to send these ids to actual owners again.
9093  if(is_pre_migrated) {
9094  this->mj_env->timerStart(MACRO_TIMERS, timer_base_string +
9095  "PostMigration DistributorPlanCreating");
9096  Tpetra::Distributor distributor(this->mj_problemComm);
9097 
9098  ArrayView<const mj_part_t> actual_owner_destinations(
9099  result_actual_owner_rank , result_num_local_coords);
9100 
9101  mj_lno_t num_incoming_gnos = distributor.createFromSends(
9102  actual_owner_destinations);
9103 
9104  if(num_incoming_gnos != this->num_local_coords) {
9105  throw std::logic_error("Zoltan2 - Multijagged Post Migration - "
9106  "num incoming is not equal to num local coords");
9107  }
9108 
9109  mj_env->timerStop(MACRO_TIMERS, timer_base_string +
9110  "PostMigration DistributorPlanCreating");
9111  mj_env->timerStart(MACRO_TIMERS, timer_base_string +
9112  "PostMigration DistributorMigration");
9113 
9114  Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
9115  Kokkos::ViewAllocateWithoutInitializing("received_gnos"),
9116  num_incoming_gnos);
9117  Kokkos::View<mj_part_t*, Kokkos::HostSpace> received_partids(
9118  Kokkos::ViewAllocateWithoutInitializing("received_partids"),
9119  num_incoming_gnos);
9120 
9121  distributor.doPostsAndWaits(host_result_initial_mj_gnos, 1,
9122  received_gnos);
9123  {
9124  Kokkos::View<mj_part_t*, Kokkos::HostSpace> sent_partnos;
9125  if (partId.size() > 0) {
9126  sent_partnos = Kokkos::View<mj_part_t*, Kokkos::HostSpace>(
9127  partId.getRawPtr(), partId.size()); //unmanaged
9128  }
9129  distributor.doPostsAndWaits(sent_partnos, 1, received_partids);
9130  }
9131 
9132  partId = arcp(new mj_part_t[this->num_local_coords],
9133  0, this->num_local_coords, true);
9134 
9135  {
9136  std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid2;
9137  localGidToLid2.reserve(this->num_local_coords);
9138  auto host_initial_mj_gnos =
9139  Kokkos::create_mirror_view(this->initial_mj_gnos);
9140  Kokkos::deep_copy(host_initial_mj_gnos,
9141  this->initial_mj_gnos);
9142  for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9143  localGidToLid2[host_initial_mj_gnos(i)] = i;
9144  }
9145 
9146  for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9147  mj_lno_t origLID = localGidToLid2[received_gnos[i]];
9148  partId[origLID] = received_partids[i];
9149  }
9150  }
9151 
9152  {
9153  delete [] result_actual_owner_rank;
9154  }
9155  mj_env->timerStop(MACRO_TIMERS,
9156  timer_base_string + "PostMigration DistributorMigration");
9157  }
9158  solution->setParts(partId);
9159  this->mj_env->timerStop(MACRO_TIMERS, timer_base_string + "cleanup");
9160  }
9161 
9162  this->mj_env->timerStop(MACRO_TIMERS, timer_base_string + "all");
9163 
9164  // reset the view (release the reference to device data)
9165  this->mj_coordinates = Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>();
9166 }
9167 
9168 /* \brief Sets the partitioning data for multijagged algorithm.
 *
 * Builds a CoordinateModel from mj_adapter, mj_env and mj_problemComm and
 * stores on this object: coord_dim, num_weights_per_coord, num_local_coords,
 * num_global_coords, num_global_parts, the global ids (initial_mj_gnos), the
 * coordinates (mj_coordinates), the weights (mj_weights) and the
 * per-criterion mj_uniform_parts / mj_uniform_weights flags.
 *
 * \param solution queried for the target global number of parts and for
 *        whether each criterion has uniform part sizes.
 *
 * \note If any criterion reports non-uniform part sizes this function prints
 *       an error and calls std::terminate().
9169  * */
9170 template <typename Adapter>
9172  const RCP<PartitioningSolution<Adapter> > &solution
9173 )
9174 {
9175  modelFlag_t flags;
9176  CoordinateModel<Adapter> mj_coords(mj_adapter, mj_env, mj_problemComm, flags);
9177 
9178  this->coord_dim = mj_coords.getCoordinateDim();
9179  this->num_weights_per_coord = mj_coords.getNumWeightsPerCoordinate();
9180  this->num_local_coords = mj_coords.getLocalNumCoordinates();
9181  this->num_global_coords = mj_coords.getGlobalNumCoordinates();
9182 
  // With no weights there is still one (implicit) criterion.
9183  int criteria_dim = (this->num_weights_per_coord ?
9184  this->num_weights_per_coord : 1);
9185  // From the Solution we get part information.
9186  // If the part sizes for a given criteria are not uniform,
9187  // then they are values that sum to 1.0.
9188  this->num_global_parts = solution->getTargetGlobalNumberOfParts();
9189  // One flag per criterion (criteria_dim entries), held in host views.
9190  // The entries are filled in further down in this function.
9191  this->mj_uniform_parts = Kokkos::View<bool *, Kokkos::HostSpace>(
9192  "uniform parts", criteria_dim);
9193  this->mj_uniform_weights = Kokkos::View<bool *, Kokkos::HostSpace>(
9194  "uniform weights", criteria_dim);
9195 
  // Views as handed out by the coordinate model (adapter scalar type).
9196  Kokkos::View<const mj_gno_t *, device_t> gnos;
9197  Kokkos::View<adapter_scalar_t **, Kokkos::LayoutLeft, device_t> xyz_adapter;
9198  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
9199  Kokkos::View<adapter_scalar_t **, device_t> wgts_adapter;
9200  mj_coords.getCoordinatesKokkos(gnos, xyz_adapter, wgts_adapter);
  // Views in MJ's own scalar type; either aliases of the above or copies.
9201  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
9202  Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> xyz;
9203  Kokkos::View<mj_scalar_t **, device_t> wgts;
9204 
9205  // Now we must get the data from the adapter.
9206  // If the types match we point to the view but if not, we must copy.
9207  if(std::is_same<mj_scalar_t, adapter_scalar_t>()) {
9208  // we can just point the views but we must specialize because this code
9209  // only compiles in this case - for is_same false assign does nothing.
9210  assign_if_same(xyz, xyz_adapter);
9211  assign_if_same(wgts, wgts_adapter);
9212  }
9213  else {
9214  // we only allocate a new view if we are going to copy
9215  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
9216  xyz = Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
9217  (Kokkos::ViewAllocateWithoutInitializing(
9218  "xyz"), xyz_adapter.extent(0), xyz_adapter.extent(1));
9219  wgts = Kokkos::View<mj_scalar_t **, device_t>(
9220  Kokkos::ViewAllocateWithoutInitializing("wgts"),
9221  wgts_adapter.extent(0), wgts_adapter.extent(1));
9222 
  // Element-wise cast of every coordinate into mj_scalar_t; one parallel
  // iteration per row, inner loop over the columns.
9223  typedef typename Kokkos::View<mj_scalar_t **, device_t>::size_type view_size_t;
9224  Kokkos::parallel_for(
9225  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9226  (0, xyz_adapter.extent(0)), KOKKOS_LAMBDA (int i) {
9227  for(view_size_t n = 0; n < xyz_adapter.extent(1); ++n) {
9228  xyz(i, n) = static_cast<mj_scalar_t>(xyz_adapter(i, n));
9229  }
9230  });
  // Same element-wise cast for the weights.
9231  Kokkos::parallel_for(
9232  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9233  (0, wgts.extent(0)), KOKKOS_LAMBDA (int i) {
9234  for(view_size_t n = 0; n < wgts.extent(1); ++n) {
9235  wgts(i, n) = static_cast<mj_scalar_t>(wgts_adapter(i, n));
9236  }
9237  });
9238  }
9239 
9240  // obtain global ids.
9241  this->initial_mj_gnos = gnos;
9242  // keep the coordinate view (aliased or converted above).
9243  this->mj_coordinates = xyz;
9244  // if no weights are provided set uniform weight.
9245 
9246  if(this->num_weights_per_coord == 0) {
9247  this->mj_uniform_weights(0) = true;
9248  Kokkos::resize(this->mj_weights, 0, 0);
9249  }
9250  else{
9251  this->mj_weights = wgts;
9252  for(int wdim = 0; wdim < this->num_weights_per_coord; ++wdim) {
9253  this->mj_uniform_weights(wdim) = false;
9254  }
9255  }
9256 
  // Only uniform target part sizes are accepted; anything else aborts.
9257  for(int wdim = 0; wdim < criteria_dim; ++wdim) {
9258  if(solution->criteriaHasUniformPartSizes(wdim)) {
9259  this->mj_uniform_parts(wdim) = true;
9260  }
9261  else {
9262  printf("Error: MJ does not support non uniform target part weights\n");
9263  std::terminate();
9264  }
9265  }
9266 }
9267 
9268 /* \brief Sets the partitioning parameters for multijagged algorithm.
9269  * \param pl: is the parameter list provided to zoltan2 call
9270  * */
9271 template <typename Adapter>
9273  const Teuchos::ParameterList &pl)
9274 {
9275  const Teuchos::ParameterEntry *pe = pl.getEntryPtr("imbalance_tolerance");
9276  if(pe) {
9277  double tol;
9278  tol = pe->getValue(&tol);
9279  this->imbalance_tolerance = tol - 1.0;
9280  }
9281 
9282  // TODO: May be a more relaxed tolerance is needed. RCB uses 10%
9283  if(this->imbalance_tolerance <= 0) {
9284  this->imbalance_tolerance= 10e-4;
9285  }
9286 
9287  // if an input partitioning array is provided.
9288  Kokkos::resize(this->part_no_array, 0);
9289 
9290  // the length of the input partitioning array.
9291  this->recursion_depth = 0;
9292 
9293  if(pl.getPtr<int>("mj_num_teams")) {
9294  this->num_teams = pl.get<int>("mj_num_teams");
9295  }
9296 
9297  if(pl.getPtr<Array <mj_part_t> >("mj_parts")) {
9298  auto mj_parts = pl.get<Array <mj_part_t> >("mj_parts");
9299  int mj_parts_size = static_cast<int>(mj_parts.size());
9300 
9301  // build the view we'll have data on and copy values from host
9302  this->part_no_array = Kokkos::View<mj_part_t*, Kokkos::HostSpace>(
9303  "part_no_array", mj_parts_size);
9304  for(int i = 0; i < mj_parts_size; ++i) {
9305  this->part_no_array(i) = mj_parts.getRawPtr()[i];
9306  }
9307 
9308  this->recursion_depth = mj_parts_size - 1;
9309  this->mj_env->debug(2, "mj_parts provided by user");
9310  }
9311 
9312  // get mj specific parameters.
9313  this->distribute_points_on_cut_lines = true;
9314  this->max_concurrent_part_calculation = 1;
9315 
9316  this->mj_run_as_rcb = false;
9317  this->mj_premigration_option = 0;
9318  this->min_coord_per_rank_for_premigration = 32000;
9319 
9320  int mj_user_recursion_depth = -1;
9321  this->mj_keep_part_boxes = false;
9322  this->check_migrate_avoid_migration_option = 0;
9323  this->migration_type = 0;
9324  this->minimum_migration_imbalance = 0.35;
9325 
9326  pe = pl.getEntryPtr("mj_minimum_migration_imbalance");
9327  if(pe) {
9328  double imb;
9329  imb = pe->getValue(&imb);
9330  this->minimum_migration_imbalance = imb - 1.0;
9331  }
9332 
9333  pe = pl.getEntryPtr("mj_migration_option");
9334  if(pe) {
9335  this->check_migrate_avoid_migration_option =
9336  pe->getValue(&this->check_migrate_avoid_migration_option);
9337  } else {
9338  this->check_migrate_avoid_migration_option = 0;
9339  }
9340  if(this->check_migrate_avoid_migration_option > 1) {
9341  this->check_migrate_avoid_migration_option = -1;
9342  }
9343 
9345  pe = pl.getEntryPtr("mj_migration_type");
9346  if(pe) {
9347  this->migration_type = pe->getValue(&this->migration_type);
9348  } else {
9349  this->migration_type = 0;
9350  }
9351 
9352  //std::cout << " this->migration_type:" << this->migration_type << std::endl;
9354 
9355  pe = pl.getEntryPtr("mj_concurrent_part_count");
9356  if(pe) {
9357  this->max_concurrent_part_calculation =
9358  pe->getValue(&this->max_concurrent_part_calculation);
9359  } else {
9360  this->max_concurrent_part_calculation = 1; // Set to 1 if not provided.
9361  }
9362 
9363  pe = pl.getEntryPtr("mj_keep_part_boxes");
9364  if(pe) {
9365  this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
9366  } else {
9367  this->mj_keep_part_boxes = false; // Set to invalid value
9368  }
9369 
9370  // For now, need keep_part_boxes to do pointAssign and boxAssign.
9371  // pe = pl.getEntryPtr("keep_cuts");
9372  // if(pe) {
9373  // int tmp = pe->getValue(&tmp);
9374  // if(tmp) this->mj_keep_part_boxes = true;
9375  // }
9376 
9377  //need to keep part boxes if mapping type is geometric.
9378  if(this->mj_keep_part_boxes == false) {
9379  pe = pl.getEntryPtr("mapping_type");
9380  if(pe) {
9381  int mapping_type = -1;
9382  mapping_type = pe->getValue(&mapping_type);
9383  if(mapping_type == 0) {
9384  mj_keep_part_boxes = true;
9385  }
9386  }
9387  }
9388 
9389  // need to keep part boxes if mapping type is geometric.
9390  pe = pl.getEntryPtr("mj_enable_rcb");
9391  if(pe) {
9392  this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
9393  } else {
9394  this->mj_run_as_rcb = false; // Set to invalid value
9395  }
9396 
9397  pe = pl.getEntryPtr("mj_premigration_option");
9398  if(pe) {
9399  mj_premigration_option = pe->getValue(&mj_premigration_option);
9400  } else {
9401  mj_premigration_option = 0;
9402  }
9403 
9404  pe = pl.getEntryPtr("mj_premigration_coordinate_count");
9405  if(pe) {
9406  min_coord_per_rank_for_premigration = pe->getValue(&mj_premigration_option);
9407  } else {
9408  min_coord_per_rank_for_premigration = 32000;
9409  }
9410 
9411  pe = pl.getEntryPtr("mj_recursion_depth");
9412  if(pe) {
9413  mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
9414  } else {
9415  mj_user_recursion_depth = -1; // Set to invalid value
9416  }
9417 
9418  bool val = false;
9419  pe = pl.getEntryPtr("rectilinear");
9420  if(pe) {
9421  val = pe->getValue(&val);
9422  }
9423  if(val) {
9424  this->distribute_points_on_cut_lines = false;
9425  } else {
9426  this->distribute_points_on_cut_lines = true;
9427  }
9428 
9429  if(this->mj_run_as_rcb) {
9430  mj_user_recursion_depth =
9431  (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
9432  }
9433  if(this->recursion_depth < 1) {
9434  if(mj_user_recursion_depth > 0) {
9435  this->recursion_depth = mj_user_recursion_depth;
9436  }
9437  else {
9438  this->recursion_depth = this->coord_dim;
9439  }
9440  }
9441 }
9442 
// Finds the parts whose kept part boxes overlap the query box [lower, upper].
//
// dim          number of coordinates in lower/upper.
// lower, upper corners of the query box.
// nPartsFound  output: number of overlapping part boxes (0 if none).
// partsFound   output: set to NULL, or to an array of nPartsFound part ids
//              allocated here with new[] (presumably released by the caller
//              with delete[] - confirm against callers).
//
// Throws std::logic_error when mj_keep_part_boxes is false or when no part
// boxes exist. A query box that does not overlap the global box yields
// nPartsFound == 0 and *partsFound == NULL.
9444 template <typename Adapter>
9446  int dim,
9447  adapter_scalar_t *lower,
9448  adapter_scalar_t *upper,
9449  size_t &nPartsFound,
9450  typename Adapter::part_t **partsFound) const
9451 {
9452  // TODO: Implement with cuts rather than boxes to reduce algorithmic
9453  // TODO: complexity. Or at least do a search through the boxes, using
9454  // TODO: p x q x r x ... if possible.
9455 
  // Start from the "nothing found" result.
9456  nPartsFound = 0;
9457  *partsFound = NULL;
9458 
9459  if(this->mj_keep_part_boxes) {
9460 
9461  // Get vector of part boxes
9462  RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9463 
9464  size_t nBoxes = (*partBoxes).size();
9465  if(nBoxes == 0) {
9466  throw std::logic_error("no part boxes exist");
9467  }
9468 
9469  // Determine whether the box overlaps the globalBox at all
9470  RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9471 
9472  if(globalBox->boxesOverlap(dim, lower, upper)) {
9473 
9474  std::vector<typename Adapter::part_t> partlist;
9475 
9476  // box overlaps the global box; find specific overlapping boxes
9477  for(size_t i = 0; i < nBoxes; i++) {
  // NOTE(review): the handler that follows this try is not visible here.
9478  try {
9479  if((*partBoxes)[i].boxesOverlap(dim, lower, upper)) {
9480  nPartsFound++;
9481  partlist.push_back((*partBoxes)[i].getpId());
9482  /*
9483  std::cout << "Given box (";
9484  for(int j = 0; j < dim; j++)
9485  std::cout << lower[j] << " ";
9486  std::cout << ") x (";
9487  for(int j = 0; j < dim; j++)
9488  std::cout << upper[j] << " ";
9489  std::cout << ") overlaps PartBox "
9490  << (*partBoxes)[i].getpId() << " (";
9491  for(int j = 0; j < dim; j++)
9492  std::cout << (*partBoxes)[i].getlmins()[j] << " ";
9493  std::cout << ") x (";
9494  for(int j = 0; j < dim; j++)
9495  std::cout << (*partBoxes)[i].getlmaxs()[j] << " ";
9496  std::cout << ")" << std::endl;
9497  */
9498  }
9499  }
9501  }
  // Copy the collected part ids into a freshly allocated output array.
9502  if(nPartsFound) {
9503  *partsFound = new mj_part_t[nPartsFound];
9504  for(size_t i = 0; i < nPartsFound; i++)
9505  (*partsFound)[i] = partlist[i];
9506  }
9507  }
9508  else {
9509  // Box does not overlap the domain at all. Find the closest part
9510  // Not sure how to perform this operation for MJ without having the
9511  // cuts. With the RCB cuts, the concept of a part extending to
9512  // infinity was natural. With the boxes, it is much more difficult.
9513  // TODO: For now, return information indicating NO OVERLAP.
9514  }
9515  }
9516  else {
  // The message names "keep_cuts", but the flag tested is mj_keep_part_boxes.
9517  throw std::logic_error("need to use keep_cuts parameter for boxAssign");
9518  }
9519 }
9520 
// Returns the id of the part that the given point is assigned to, using the
// kept part boxes.
//
// dim    number of coordinates in point.
// point  coordinates of the query point.
//
// If the point lies in the global box, the id of the first part box that
// contains it is returned. Otherwise the id of the part box whose centroid
// has the smallest squared distance to the point is returned.
//
// Throws std::logic_error when mj_keep_part_boxes is false, when no part
// boxes exist, or when the point is inside the global box but in no part box.
9522 template <typename Adapter>
9524  int dim,
9525  adapter_scalar_t *point) const
9526 {
9527  // TODO: Implement with cuts rather than boxes to reduce algorithmic
9528  // TODO: complexity. Or at least do a search through the boxes, using
9529  // TODO: p x q x r x ... if possible.
9530 
9531  if(this->mj_keep_part_boxes) {
9532  typename Adapter::part_t foundPart = -1;
9533 
9534  // Get vector of part boxes
9535  RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9536 
9537  size_t nBoxes = (*partBoxes).size();
9538  if(nBoxes == 0) {
9539  throw std::logic_error("no part boxes exist");
9540  }
9541 
9542  // Determine whether the point is within the global domain
9543  RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9544 
9545  if(globalBox->pointInBox(dim, point)) {
9546 
9547  // point is in the global domain; determine in which part it is.
  // i is declared outside the loop so the "not found" check below can use it.
9548  size_t i;
9549  for(i = 0; i < nBoxes; i++) {
  // NOTE(review): the handler that follows this try is not visible here.
9550  try {
9551  if((*partBoxes)[i].pointInBox(dim, point)) {
9552  foundPart = (*partBoxes)[i].getpId();
9553  // std::cout << "Point (";
9554  // for(int j = 0; j < dim; j++) std::cout << point[j] << " ";
9555  // std::cout << ") found in box " << i << " part " << foundPart
9556  // << std::endl;
9557  // (*partBoxes)[i].print();
9558  break;
9559  }
9560  }
9562  }
9563 
9564  if(i == nBoxes) {
9565  // This error should never occur
9566  std::ostringstream oss;
9567  oss << "Point (";
9568  for(int j = 0; j < dim; j++) oss << point[j] << " ";
9569  oss << ") not found in domain";
9570  throw std::logic_error(oss.str());
9571  }
9572  }
9573 
9574  else {
9575  // Point is outside the global domain.
9576  // Determine to which part it is closest.
9577  // TODO: with cuts, would not need this special case
9578 
9579  typedef typename Zoltan2::coordinateModelPartBox::coord_t coord_t;
9580  size_t closestBox = 0;
9581  coord_t minDistance = std::numeric_limits<coord_t>::max();
  // Scratch buffer for one centroid; released after the loop.
9582  coord_t *centroid = new coord_t[dim];
9583  for(size_t i = 0; i < nBoxes; i++) {
9584  (*partBoxes)[i].computeCentroid(centroid);
  // Squared Euclidean distance is enough for picking the minimum.
9585  coord_t sum = 0.;
9586  coord_t diff;
9587  for(int j = 0; j < dim; j++) {
9588  diff = centroid[j] - point[j];
9589  sum += diff * diff;
9590  }
9591  if(sum < minDistance) {
9592  minDistance = sum;
9593  closestBox = i;
9594  }
9595  }
9596  foundPart = (*partBoxes)[closestBox].getpId();
9597  delete [] centroid;
9598  }
9599 
9600  return foundPart;
9601  }
9602  else {
  // The message names "keep_cuts", but the flag tested is mj_keep_part_boxes.
9603  throw std::logic_error("need to use keep_cuts parameter for pointAssign");
9604  }
9605 }
9606 
// Returns the communication graph of the parts as a pair of adjacency arrays.
//
// solution  not used in the body of this function.
// comXAdj   output: set to the cached comXAdj_ array.
// comAdj    output: set to the cached comAdj_ array.
//
// The arrays are built once, from the global part boxes via GridHash, the
// first time both cached arrays are still unset; later calls return the
// cached arrays.
9607 template <typename Adapter>
9609  const PartitioningSolution<Adapter> *solution,
9610  ArrayRCP<typename Zoltan2_AlgMJ<Adapter>::mj_part_t> &comXAdj,
9611  ArrayRCP<typename Zoltan2_AlgMJ<Adapter>::mj_part_t> &comAdj)
9612 {
9613  if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL) {
9614  RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
9615  mj_part_t ntasks = (*pBoxes).size();
  // NOTE(review): element 0 is read without checking that pBoxes is
  // non-empty - confirm callers guarantee at least one box.
9616  int dim = (*pBoxes)[0].getDim();
9617  GridHash grid(pBoxes, ntasks, dim);
9618  grid.getAdjArrays(comXAdj_, comAdj_);
9619  }
9620  comAdj = comAdj_;
9621  comXAdj = comXAdj_;
9622 }
9623 
// Returns the part boxes kept by the underlying MJ partitioner
// (forwards to mj_partitioner.get_kept_boxes()).
9624 template <typename Adapter>
9625 RCP<typename Zoltan2_AlgMJ<Adapter>::mj_partBoxVector_t>
9627 {
9628  return this->mj_partitioner.get_kept_boxes();
9629 }
9630 } // namespace Zoltan2
9631 
9632 #endif
Zoltan2_MJArrayType< scalar_t > value_type
KOKKOS_INLINE_FUNCTION Zoltan2_MJArrayType()
KOKKOS_INLINE_FUNCTION void init(value_type dst) const
Kokkos::View< index_t *, device_t > part_xadj
GridHash Class, Hashing Class for part boxes.
void set_up_partitioning_data(const RCP< PartitioningSolution< Adapter > > &solution)
Time an algorithm (or other entity) as a whole.
global_size_t getGlobalNumCoordinates() const
Returns the global number coordinates.
void set(IT index_, CT count_, WT *vals_)
KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const
#define Z2_FORWARD_EXCEPTIONS
Forward an exception back through call stack.
KOKKOS_INLINE_FUNCTION void operator()(const member_type &teamMember, value_type teamSum) const
Kokkos::View< index_t *, device_t > track_on_cuts
Defines Parameter related enumerators, declares functions.
KOKKOS_INLINE_FUNCTION ArrayReducer(value_type &val, int mj_value_count)
static RCP< Teuchos::BoolParameterEntryValidator > getBoolValidator()
Exists to make setting up validators less cluttered.
KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
void getAdjArrays(ArrayRCP< part_t > &comXAdj_, ArrayRCP< part_t > &comAdj_)
GridHash Class, returns the adj arrays.
Zoltan2_AlgMJ(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, const RCP< const typename Adapter::base_adapter_t > &adapter)
Zoltan2_BoxBoundaries(Ordinal s_)
Constructor.
Kokkos::View< scalar_t *, device_t > coordinates
Sort items for quick sort function.
typename node_t::device_type device_t
std::bitset< NUM_MODEL_FLAGS > modelFlag_t
void uqSignsort(IT n, uSignedSortItem< IT, WT, SIGN > *arr)
Quick sort function. Sorts the arr of uSignedSortItems, with respect to increasing vals...
Kokkos::View< index_t *, device_t > permutations
map_t::global_ordinal_type gno_t
Definition: mapRemotes.cpp:18
typename Zoltan2::InputTraits< ztcrsmatrix_t >::node_t node_t
Class for sorting items with multiple values. First sorting with respect to val[0], then val[1] then ... val[count-1]. The last tie breaking is done with index values. Used for task mapping partitioning where the points on a cut line needs to be distributed consistently.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyDoubleValidator()
Exists to make setting up validators less cluttered.
Kokkos::View< scalar_t **, device_t > weights
KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
void partition(const RCP< PartitioningSolution< Adapter > > &solution)
Multi Jagged coordinate partitioning algorithm.
void reduce(const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
Implement Teuchos::ValueTypeReductionOp interface.
KOKKOS_INLINE_FUNCTION value_type & reference() const
Kokkos::View< scalar_t * > scalar_view_t
coordinateModelPartBox Class, represents the boundaries of the box which is a result of a geometric p...
KOKKOS_INLINE_FUNCTION void init(value_type &dst) const
void set_to_keep_part_boxes()
Function call, if the part boxes are intended to be kept.
A ParameterList validator for integer range lists.
void getCommunicationGraph(const PartitioningSolution< Adapter > *solution, ArrayRCP< mj_part_t > &comXAdj, ArrayRCP< mj_part_t > &comAdj)
returns communication graph resulting from MJ partitioning.
SparseMatrixAdapter_t::part_t part_t
Multi Jagged coordinate partitioning algorithm.
#define epsilon
Definition: nd.cpp:82
This class provides geometric coordinates with optional weights to the Zoltan2 algorithm.
void boxAssign(int dim, adapter_scalar_t *lower, adapter_scalar_t *upper, size_t &nPartsFound, mj_part_t **partsFound) const
tuple root
Definition: validXML.py:24
Kokkos::View< scalar_t *, device_t > cut_coordinates
A PartitioningSolution is a solution to a partitioning problem.
Zoltan2_BoxBoundaries()
Default Constructor.
Kokkos::View< index_t *, device_t > permutations
void set_partitioning_parameters(bool distribute_points_on_cut_lines_, int max_concurrent_part_calculation_, int check_migrate_avoid_migration_option_, double minimum_migration_imbalance_, int migration_type_=0)
Multi Jagged coordinate partitioning algorithm.
KOKKOS_INLINE_FUNCTION value_type & reference() const
size_t getCoordinatesKokkos(Kokkos::View< const gno_t *, typename node_t::device_type > &Ids, Kokkos::View< scalar_t **, Kokkos::LayoutLeft, typename node_t::device_type > &xyz, Kokkos::View< scalar_t **, typename node_t::device_type > &wgts) const
Returns the coordinate ids, values and optional weights.
AlgMJ()
Multi Jagged coordinate partitioning algorithm default constructor.
Kokkos::View< part_t *, device_t > parts
Zoltan2_BoxBoundaries is a reduction operation to all reduce the all box boundaries.
ReduceWeightsFunctor(int mj_loop_count, array_t mj_max_scalar, part_t mj_concurrent_current_part, part_t mj_num_cuts, part_t mj_current_work_part, part_t mj_current_concurrent_num_parts, part_t mj_left_right_array_size, part_t mj_weight_array_size, Kokkos::View< index_t *, device_t > &mj_permutations, Kokkos::View< scalar_t *, device_t > &mj_coordinates, Kokkos::View< scalar_t **, device_t > &mj_weights, Kokkos::View< part_t *, device_t > &mj_parts, Kokkos::View< scalar_t *, device_t > &mj_cut_coordinates, Kokkos::View< index_t *, device_t > &mj_part_xadj, bool mj_uniform_weights0, scalar_t mj_sEpsilon)
Algorithm defines the base class for all algorithms.
map_t::local_ordinal_type lno_t
Definition: mapRemotes.cpp:17
mj_partBoxVector_t & getPartBoxesView() const
for partitioning methods, return bounding boxes of the
#define Z2_ASSERT_VALUE(actual, expected)
Throw an error when actual value is not equal to expected value.
RCP< mj_partBoxVector_t > get_kept_boxes() const
DOCWORK: Documentation.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyIntValidator()
Exists to make setting up validators less cluttered.
Kokkos::View< part_t *, device_t > parts
RCP< mj_partBoxVector_t > compute_global_box_boundaries(RCP< mj_partBoxVector_t > &localPartBoxes) const
DOCWORK: Documentation.
uMultiSortItem(IT index_, CT count_, WT *vals_)
KOKKOS_INLINE_FUNCTION ArrayCombinationReducer(scalar_t mj_max_scalar, value_type &val, int mj_value_count_rightleft, int mj_value_count_weights)
KOKKOS_INLINE_FUNCTION void init(value_type dst) const
size_t getLocalNumCoordinates() const
Returns the number of coordinates on this process.
int getNumWeightsPerCoordinate() const
Returns the number (0 or greater) of weights per coordinate.
Define IntegerRangeList validator.
size_t team_shmem_size(int team_size) const
Defines the CoordinateModel classes.
void multi_jagged_part(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, double imbalance_tolerance, int num_teams, size_t num_global_parts, Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, int recursion_depth, int coord_dim, mj_lno_t num_local_coords, mj_gno_t num_global_coords, Kokkos::View< const mj_gno_t *, device_t > &initial_mj_gnos, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates, int num_weights_per_coord, Kokkos::View< bool *, Kokkos::HostSpace > &mj_uniform_weights, Kokkos::View< mj_scalar_t **, device_t > &mj_weights, Kokkos::View< bool *, Kokkos::HostSpace > &mj_uniform_parts, Kokkos::View< mj_part_t *, device_t > &result_assigned_part_ids, Kokkos::View< mj_gno_t *, device_t > &result_mj_gnos)
Multi Jagged coordinate partitioning algorithm.
KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const
Kokkos::View< scalar_t * > scalar_view_t
Tpetra::global_size_t global_size_t
Zoltan2_MJArrayType< scalar_t > value_type
mj_part_t pointAssign(int dim, adapter_scalar_t *point) const
KOKKOS_INLINE_FUNCTION void init(value_type &dst) const
#define Z2_THROW_OUTSIDE_ERROR(env)
Throw an error returned from outside the Zoltan2 library.
int getCoordinateDim() const
Returns the dimension of the coordinates.
Kokkos::View< scalar_t *, device_t > coordinates
ReduceArrayFunctor(part_t mj_concurrent_current_part, part_t mj_weight_array_size, Kokkos::View< index_t *, device_t > &mj_permutations, Kokkos::View< scalar_t *, device_t > &mj_coordinates, Kokkos::View< part_t *, device_t > &mj_parts, Kokkos::View< index_t *, device_t > &mj_part_xadj, Kokkos::View< index_t *, device_t > &mj_track_on_cuts)
Kokkos::View< index_t *, device_t > part_xadj
KOKKOS_INLINE_FUNCTION Zoltan2_MJArrayType(scalar_t *pSetPtr)
A gathering of useful namespace methods.
void uqsort(IT n, uSortItem< IT, WT > *arr)
Quick sort function. Sorts the arr of uSortItems, with respect to increasing vals. DOCWORK: Document input params.
Contains Teuchos reduction operators for the Multi-jagged algorithm.
RCP< mj_partBox_t > get_global_box() const
DOCWORK: Documentation.
KOKKOS_INLINE_FUNCTION void join(volatile value_type &dst, const volatile value_type &src) const
size_t team_shmem_size(int team_size) const
Multi Jagged coordinate partitioning algorithm.
static void getValidParameters(ParameterList &pl)
Set up validators specific to this algorithm.
void sequential_task_partitioning(const RCP< const Environment > &env, mj_lno_t num_total_coords, mj_lno_t num_selected_coords, size_t num_target_part, int coord_dim, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates_, Kokkos::View< mj_lno_t *, device_t > &initial_selected_coords_output_permutation, mj_lno_t *output_xadj, int recursion_depth_, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, bool partition_along_longest_dim, int num_ranks_per_node, bool divide_to_prime_first_, mj_part_t num_first_level_parts_=1, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &first_level_distribution_=Kokkos::View< mj_part_t *, Kokkos::HostSpace >())
Special function for partitioning for task mapping. Runs sequential, and performs deterministic parti...