#ifndef _ZOLTAN2_COORD_PARTITIONMAPPING_HPP_
#define _ZOLTAN2_COORD_PARTITIONMAPPING_HPP_

// ...
#include "Teuchos_ArrayViewDecl.hpp"
// ...
#include "Teuchos_ReductionOp.hpp"
// ...
#include "Teuchos_Comm.hpp"
#ifdef HAVE_ZOLTAN2_MPI
#include "Teuchos_DefaultMpiComm.hpp"
#endif // HAVE_ZOLTAN2_MPI
#include "Teuchos_DefaultSerialComm.hpp"
/*! \brief Zoltan2_ReduceBestMapping class: reduces to the minimum
 *  cost mapping; ties break with the minimum proc id.
 */
template <typename Ordinal, typename T>
class Zoltan2_ReduceBestMapping :
  public Teuchos::ValueTypeReductionOp<Ordinal, T> {
private:
  T epsilon;
public:
  /*! \brief Default Constructor */
  Zoltan2_ReduceBestMapping() :
    epsilon(std::numeric_limits<T>::epsilon()) {
  }

  /*! \brief Implement Teuchos::ValueTypeReductionOp interface.
   *  Buffers hold [cost, rank] pairs.
   */
  void reduce(const Ordinal count,
              const T inBuffer[],
              T inoutBuffer[]) const {
    for (Ordinal i = 0; i < count; i++) {
      if (inBuffer[0] - inoutBuffer[0] < -epsilon) {
        // strictly smaller cost wins
        inoutBuffer[0] = inBuffer[0];
        inoutBuffer[1] = inBuffer[1];
      }
      else if (inBuffer[0] - inoutBuffer[0] < epsilon &&
               inBuffer[1] - inoutBuffer[1] < epsilon) {
        // near-equal cost: the smaller rank wins
        inoutBuffer[0] = inBuffer[0];
        inoutBuffer[1] = inBuffer[1];
      }
    }
  }
};
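/*! Example (illustrative sketch of how this reduction is used): each
 *  rank contributes a [cost, rank] pair and the winner is the pair
 *  with the smallest cost, ties going to the smallest rank, exactly
 *  as done in getBestMapping() further below.
 *  \code
 *  double localCost[2], globalCost[2];
 *  localCost[0] = myCost;                   // this rank's mapping cost
 *  localCost[1] = double(comm->getRank());  // this rank's id
 *  globalCost[1] = globalCost[0] = std::numeric_limits<double>::max();
 *  Zoltan2_ReduceBestMapping<int, double> reduceBest;
 *  reduceAll<int, double>(*comm, reduceBest, 2, localCost, globalCost);
 *  int sender = int(globalCost[1]);  // rank holding the best mapping
 *  \endcode
 */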
//! Returns the factorial of x; used to enumerate dimension orderings.
template <typename it>
inline it z2Fact(it x) {
  return (x == 1 ? x : x * z2Fact<it>(x - 1));
}
template <typename gno_t, typename part_t>
// ...
//! Computes the i-th lexicographic permutation of {0, ..., n-1}.
template <typename IT>
void ithPermutation(const IT n, IT i, IT *perm) {
  IT j, k = 0;
  IT *fact = new IT[n];
  // compute the factorial table
  fact[k] = 1;
  while (++k < n)
    fact[k] = fact[k - 1] * k;
  // compute the factorial code of i
  for (k = 0; k < n; ++k) {
    perm[k] = i / fact[n - 1 - k];
    i = i % fact[n - 1 - k];
  }
  // readjust the code to obtain the permutation: increment perm[k]
  // once for each earlier entry that is not larger
  for (k = n - 1; k > 0; --k)
    for (j = k - 1; j >= 0; --j)
      if (perm[j] <= perm[k])
        perm[k]++;
  delete [] fact;
}
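/*! Example (illustrative): for n = 3 the six permutations in
 *  lexicographic order are 0:012, 1:021, 2:102, 3:120, 4:201, 5:210.
 *  \code
 *  int perm[3];
 *  ithPermutation<int>(3, 4, perm);  // perm becomes {2, 0, 1}
 *  \endcode
 */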
//! Builds the CRS communication graph of a dim-dimensional grid of
//! tasks, where each task communicates with its 2*dim face neighbors.
template <typename part_t>
void getGridCommunicationGraph(
    part_t taskCount,
    part_t *&task_comm_xadj,
    part_t *&task_comm_adj,
    std::vector<int> grid_dims) {
  int dim = grid_dims.size();
  int neighborCount = 2 * dim;
  task_comm_xadj = new part_t[taskCount + 1];
  task_comm_adj = new part_t[taskCount * neighborCount];

  part_t neighBorIndex = 0;
  task_comm_xadj[0] = 0;
  for (part_t i = 0; i < taskCount; ++i) {
    part_t prevDimMul = 1;
    for (int j = 0; j < dim; ++j) {
      part_t lNeighbor = i - prevDimMul;
      part_t rNeighbor = i + prevDimMul;
      prevDimMul *= grid_dims[j];
      // a neighbor is valid only if it stays within the same slab
      // along dimension j
      if (lNeighbor >= 0 &&
          lNeighbor / prevDimMul == i / prevDimMul &&
          lNeighbor < taskCount) {
        task_comm_adj[neighBorIndex++] = lNeighbor;
      }
      if (rNeighbor >= 0 &&
          rNeighbor / prevDimMul == i / prevDimMul &&
          rNeighbor < taskCount) {
        task_comm_adj[neighBorIndex++] = rNeighbor;
      }
    }
    task_comm_xadj[i + 1] = neighBorIndex;
  }
}
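/*! Example (illustrative; part_t is the chosen integer type): for a
 *  2x3 grid, task i sits at position (i % 2, i / 2), so task 0 gets
 *  the neighbors {1, 2}.
 *  \code
 *  part_t *xadj, *adj;                  // allocated by the call
 *  std::vector<int> dims = {2, 3};
 *  getGridCommunicationGraph<part_t>(6, xadj, adj, dims);
 *  // neighbors of task i: adj[xadj[i]] .. adj[xadj[i+1] - 1]
 *  \endcode
 */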
//! Computes the center (average) coordinate of every part of the
//! given solution.
template <typename Adapter, typename scalar_t, typename part_t>
void getSolutionCenterCoordinates(
    const Environment *envConst,
    const Teuchos::Comm<int> *comm,
    const Zoltan2::CoordinateModel<typename Adapter::base_adapter_t> *coords,
    const part_t *parts,
    int coordDim,
    part_t ntasks,
    scalar_t **partCenters) {
  // ...
  ArrayView<const gno_t> gnos;
  ArrayView<input_t> xyz;
  ArrayView<input_t> wgts;
  // ...
  // count the local points assigned to each part
  gno_t *point_counts = new gno_t[ntasks];
  memset(point_counts, 0, sizeof(gno_t) * ntasks);
  // ...
  gno_t *global_point_counts = new gno_t[ntasks];

  scalar_t **multiJagged_coordinates = new scalar_t*[coordDim];
  ArrayRCP<ArrayRCP<const scalar_t>> ar = arcp(
    new ArrayRCP<const scalar_t>[coordDim], 0, coordDim);
  for (int dim = 0; dim < coordDim; dim++) {
    xyz[dim].getInputArray(ar[dim]);
    // multiJagged coordinate values assignment
    multiJagged_coordinates[dim] = (scalar_t *)ar[dim].getRawPtr();
    memset(partCenters[dim], 0, sizeof(scalar_t) * ntasks);
  }

  // get the local coordinate sums and point counts of each part
  for (lno_t i = 0; i < numLocalCoords; i++) {
    part_t p = parts[i];
    // ...
    for (int j = 0; j < coordDim; ++j) {
      scalar_t c = multiJagged_coordinates[j][i];
      partCenters[j][p] += c;
    }
    ++point_counts[p];
  }

  // sum up the local point counts to obtain the global counts
  reduceAll<int, gno_t>(*comm, Teuchos::REDUCE_SUM,
                        ntasks, point_counts, global_point_counts);
  // ...
  // divide the local sums by the global counts; the partial averages
  // are then summed across ranks to obtain the part centers
  for (int j = 0; j < coordDim; ++j) {
    for (part_t i = 0; i < ntasks; ++i) {
      if (global_point_counts[i] > 0)
        partCenters[j][i] /= global_point_counts[i];
    }
  }

  scalar_t *tmpCoords = new scalar_t[ntasks];
  for (int j = 0; j < coordDim; ++j) {
    reduceAll<int, scalar_t>(*comm, Teuchos::REDUCE_SUM,
                             ntasks, partCenters[j], tmpCoords);
    // swap the reduced buffer into place
    scalar_t *tmp = partCenters[j];
    partCenters[j] = tmpCoords;
    tmpCoords = tmp;
  }
  // ...
  delete [] point_counts;
  delete [] global_point_counts;
  // ...
  delete [] multiJagged_coordinates;
}
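/*! Usage sketch (illustrative; mirrors the calls made by the
 *  CoordinateTaskMapper constructors further below, with the raw
 *  pointers taken from RCP-held objects):
 *  \code
 *  getSolutionCenterCoordinates<Adapter, scalar_t, part_t>(
 *      env.getRawPtr(), comm.getRawPtr(), coordinateModel.getRawPtr(),
 *      parts, coordDim, ntasks, partCenters);
 *  // partCenters[d][p] is the d-th coordinate of part p's centroid
 *  \endcode
 */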
//! Builds the coarsened part graph: for each part, the neighboring
//! parts and the summed edge weights between them, in CRS form.
template <typename Adapter, typename scalar_t, typename part_t>
void getCoarsenedPartGraph(
    const Environment *envConst,
    const Teuchos::Comm<int> *comm,
    const Zoltan2::GraphModel<typename Adapter::base_adapter_t> *graph,
    part_t np,
    const part_t *parts,
    ArrayRCP<part_t> &g_part_xadj,
    ArrayRCP<part_t> &g_part_adj,
    ArrayRCP<scalar_t> &g_part_ew) {

  typedef typename Adapter::scalar_t t_scalar_t;
  typedef typename Adapter::offset_t t_offset_t;
  // ...
  ArrayView<const t_gno_t> Ids;
  ArrayView<t_input_t> v_wghts;
  // ...
  ArrayView<const t_gno_t> edgeIds;
  ArrayView<const t_offset_t> offsets;
  ArrayView<t_input_t> e_wgts;
  // ...
  std::vector<t_scalar_t> edge_weights;
  // ...
  if (numWeightPerEdge > 0) {
    edge_weights = std::vector<t_scalar_t>(localNumEdges);
    for (t_lno_t i = 0; i < localNumEdges; ++i) {
      edge_weights[i] = e_wgts[0][i];
    }
  }
  // find the part of each neighbor vertex
  std::vector<part_t> e_parts(localNumEdges);
#ifdef HAVE_ZOLTAN2_MPI
  if (comm->getSize() > 1) {
    // distributed: resolve neighbor parts through a distributed
    // directory
    const bool bUseLocalIDs = false;  // Local IDs not needed
    // ...
    const RCP<const Comm<int> > rcp_comm(comm, false);
    directory_t directory(rcp_comm, bUseLocalIDs, debug_level);
    directory.update(localNumVertices, &Ids[0], NULL, &parts[0],
                     NULL, directory_t::Update_Mode::Replace);
    directory.find(localNumEdges, &edgeIds[0], NULL, &e_parts[0],
                   NULL, NULL, false);
  }
  else
#endif // HAVE_ZOLTAN2_MPI
  {
    // single rank: global ids equal local ids, so the neighbor
    // parts can be read off directly
    for (t_lno_t i = 0; i < localNumEdges; ++i) {
      t_gno_t ei = edgeIds[i];
      e_parts[i] = parts[ei];
    }

    // part graph build: link the local vertices of each part
    std::vector<t_lno_t> part_begins(np, -1);
    std::vector<t_lno_t> part_nexts(localNumVertices, -1);

    for (t_lno_t i = 0; i < localNumVertices; ++i) {
      part_t ap = parts[i];
      part_nexts[i] = part_begins[ap];
      part_begins[ap] = i;
    }

    g_part_xadj = ArrayRCP<part_t>(np + 1);
    g_part_adj = ArrayRCP<part_t>(localNumEdges);
    g_part_ew = ArrayRCP<t_scalar_t>(localNumEdges);
    g_part_xadj[0] = 0;

    part_t nindex = 0;
    std::vector<part_t> part_neighbors(np);
    std::vector<t_scalar_t> part_neighbor_weights(np, 0);
    std::vector<t_scalar_t> part_neighbor_weights_ordered(np);

    // coarsen: visit each part's vertices and accumulate the edge
    // weights toward the neighboring parts
    for (t_lno_t i = 0; i < np; ++i) {
      part_t num_neighbor_parts = 0;
      t_lno_t v = part_begins[i];
      while (v != -1) {
        for (t_offset_t j = offsets[v]; j < offsets[v+1]; ++j) {
          part_t ep = e_parts[j];
          t_scalar_t ew = 1;
          if (numWeightPerEdge > 0) {
            ew = edge_weights[j];
          }
          // record ep as a neighbor the first time it is seen
          if (part_neighbor_weights[ep] < 0.00001) {
            part_neighbors[num_neighbor_parts++] = ep;
          }
          part_neighbor_weights[ep] += ew;
        }
        v = part_nexts[v];
      }

      // write part i's adjacency and weights, resetting the
      // accumulator for the next part
      for (t_lno_t j = 0; j < num_neighbor_parts; ++j) {
        part_t neighbor_part = part_neighbors[j];
        g_part_adj[nindex] = neighbor_part;
        g_part_ew[nindex++] = part_neighbor_weights[neighbor_part];
        part_neighbor_weights[neighbor_part] = 0;
      }
      g_part_xadj[i + 1] = nindex;
    }
  }
#ifdef HAVE_ZOLTAN2_MPI
  if (comm->getSize() > 1) {
    // distributed: aggregate the per-rank neighbor lists through a
    // distributed directory

    // directory user data: a destination part and the accumulated
    // edge weight toward it
    struct part_info {
      part_info() : weight(0) {
      }
      // aggregation in the directory is a +=, so entries with the
      // same destination part merge by summing their weights
      const part_info & operator+=(const part_info & src) {
        this->weight += src.weight;
        return *this;
      }
      bool operator>(const part_info & src) {
        return (destination_part > src.destination_part);
      }
      bool operator==(const part_info & src) {
        return (destination_part == src.destination_part);
      }
      part_t destination_part;
      scalar_t weight;
    };

    bool bUseLocalIDs = false;
    const int debug_level = 0;
    // ...
    const RCP<const Comm<int> > rcp_comm(comm, false);
    directory_t directory(rcp_comm, bUseLocalIDs, debug_level);
    std::vector<part_t> part_data;
    std::vector<std::vector<part_info>> user_data;
    // ...
    // link the local vertices of each part
    std::vector<t_lno_t> part_begins(np, -1);
    std::vector<t_lno_t> part_nexts(localNumVertices, -1);

    for (t_lno_t i = 0; i < localNumVertices; ++i) {
      part_t ap = parts[i];
      part_nexts[i] = part_begins[ap];
      part_begins[ap] = i;
    }

    std::vector<part_t> part_neighbors(np);
    std::vector<t_scalar_t> part_neighbor_weights(np, 0);
    std::vector<t_scalar_t> part_neighbor_weights_ordered(np);

    // coarsen the local portion of every part
    for (t_lno_t i = 0; i < np; ++i) {
      part_t num_neighbor_parts = 0;
      t_lno_t v = part_begins[i];
      while (v != -1) {
        for (t_offset_t j = offsets[v]; j < offsets[v+1]; ++j) {
          part_t ep = e_parts[j];
          t_scalar_t ew = 1;
          if (numWeightPerEdge > 0) {
            ew = edge_weights[j];
          }
          if (part_neighbor_weights[ep] < 0.00001) {
            part_neighbors[num_neighbor_parts++] = ep;
          }
          part_neighbor_weights[ep] += ew;
        }
        v = part_nexts[v];
      }

      // compact the part's neighbor weights in visit order
      for (t_lno_t j = 0; j < num_neighbor_parts; ++j) {
        part_t neighbor_part = part_neighbors[j];
        part_neighbor_weights_ordered[j] =
          part_neighbor_weights[neighbor_part];
        part_neighbor_weights[neighbor_part] = 0;
      }

      // if the part has neighbors, record an entry for the directory
      if (num_neighbor_parts > 0) {
        part_data.push_back(i);
        std::vector<part_info> new_user_data(num_neighbor_parts);
        for (int q = 0; q < num_neighbor_parts; ++q) {
          part_info & info = new_user_data[q];
          info.weight = part_neighbor_weights_ordered[q];
          info.destination_part = part_neighbors[q];
        }
        user_data.push_back(new_user_data);
      }
    }

    // query every part id 0..np-1
    std::vector<part_t> part_indices(np);
    for (part_t i = 0; i < np; ++i) part_indices[i] = i;

    // AggregateAdd merges entries with the same destination part by
    // summing their weights (using the operators defined above)
    directory.update(part_data.size(), &part_data[0], NULL, &user_data[0],
                     NULL, directory_t::Update_Mode::AggregateAdd);

    std::vector<std::vector<part_info>> find_user_data(part_indices.size());
    directory.find(find_user_data.size(), &part_indices[0], NULL,
                   &find_user_data[0], NULL, NULL, false);

    // count the total adjacency length to allocate the CRS arrays
    int get_total_length = 0;
    for (size_t n = 0; n < find_user_data.size(); ++n) {
      get_total_length += find_user_data[n].size();
    }

    g_part_xadj = ArrayRCP<part_t>(np + 1);
    g_part_adj = ArrayRCP<part_t>(get_total_length);
    g_part_ew = ArrayRCP<t_scalar_t>(get_total_length);

    // fill the CRS arrays from the aggregated directory data
    int track_insert_index = 0;
    for (size_t n = 0; n < find_user_data.size(); ++n) {
      g_part_xadj[n] = track_insert_index;
      const std::vector<part_info> & user_data_vector = find_user_data[n];
      for (size_t q = 0; q < user_data_vector.size(); ++q) {
        const part_info & info = user_data_vector[q];
        g_part_adj[track_insert_index] = info.destination_part;
        g_part_ew[track_insert_index] = info.weight;
        ++track_insert_index;
      }
    }
    g_part_xadj[np] = get_total_length;
  }
#endif // HAVE_ZOLTAN2_MPI
}
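/*! Example (illustrative): the coarsened part graph comes back in
 *  CRS form, so iterating the neighbors of a part and their total
 *  edge weights looks like:
 *  \code
 *  for (part_t p = 0; p < np; ++p) {
 *    for (part_t j = g_part_xadj[p]; j < g_part_xadj[p + 1]; ++j) {
 *      part_t neighbor = g_part_adj[j];  // neighboring part
 *      scalar_t weight = g_part_ew[j];   // summed weight p -> neighbor
 *    }
 *  }
 *  \endcode
 */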
/*! \brief KmeansHeap class: a max heap on distances that holds the
 *  heapSize minimum-distance points seen so far; the root always
 *  carries the largest of the kept distances.
 */
template <class IT, class WT>
class KmeansHeap {
  // ... (members: heapSize, indices, values, epsilon)
  ~KmeansHeap() {
    delete [] this->indices;
    delete [] this->values;
  }

  void setHeapsize(IT heapsize_) {
    this->heapSize = heapsize_;
    this->indices = new IT[heapsize_];
    this->values = new WT[heapsize_];
    // ...
  }

  void addPoint(IT index, WT distance) {
    WT maxVal = this->values[0];
    // add only if the new distance beats the current maximum
    if (distance >= maxVal)
      return;
    this->values[0] = distance;
    this->indices[0] = index;
    this->push_down(0);
  }

  // heapify downward from index_on_heap
  void push_down(IT index_on_heap) {
    IT child_index1 = 2 * index_on_heap + 1;
    IT child_index2 = 2 * index_on_heap + 2;

    IT biggerIndex = -1;
    if (child_index1 < this->heapSize && child_index2 < this->heapSize) {
      if (this->values[child_index1] < this->values[child_index2]) {
        biggerIndex = child_index2;
      }
      else {
        biggerIndex = child_index1;
      }
    }
    else if (child_index1 < this->heapSize) {
      biggerIndex = child_index1;
    }
    else if (child_index2 < this->heapSize) {
      biggerIndex = child_index2;
    }
    if (biggerIndex >= 0 &&
        this->values[biggerIndex] > this->values[index_on_heap]) {
      WT tmpVal = this->values[biggerIndex];
      this->values[biggerIndex] = this->values[index_on_heap];
      this->values[index_on_heap] = tmpVal;

      IT tmpIndex = this->indices[biggerIndex];
      this->indices[biggerIndex] = this->indices[index_on_heap];
      this->indices[index_on_heap] = tmpIndex;
      this->push_down(biggerIndex);
    }
  }

  void initValues() {
    WT MAXVAL = std::numeric_limits<WT>::max();
    for (IT i = 0; i < this->heapSize; ++i) {
      this->values[i] = MAXVAL;
      this->indices[i] = -1;
    }
  }

  //! Returns the total distance of the kept points to the center.
  WT getTotalDistance() {
    WT nc = 0;
    for (IT j = 0; j < this->heapSize; ++j) {
      nc += this->values[j];
    }
    return nc;
  }

  //! Moves the center to the mean of the kept points; returns true
  //! if any coordinate moved by more than epsilon.
  bool getNewCenters(WT *center, WT **coords, int dimension) {
    bool moved = false;
    for (int i = 0; i < dimension; ++i) {
      WT nc = 0;
      for (IT j = 0; j < this->heapSize; ++j) {
        IT k = this->indices[j];
        nc += coords[i][k];
      }
      nc /= this->heapSize;
      moved = (std::abs(center[i] - nc) > this->epsilon || moved);
      center[i] = nc;
    }
    return moved;
  }

  void copyCoordinates(IT *permutation) {
    for (IT i = 0; i < this->heapSize; ++i) {
      permutation[i] = this->indices[i];
    }
  }
};
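/*! Example (illustrative): after initValues() every slot holds
 *  MAXVAL, so the first heapSize added points are always kept; later
 *  points displace the current worst (the root) only when closer.
 *  \code
 *  KmeansHeap<int, double> heap;
 *  heap.setHeapsize(4);
 *  heap.initValues();
 *  heap.addPoint(7, 0.5);  // kept: 0.5 < MAXVAL at the root
 *  heap.addPoint(2, 3.0);  // kept
 *  heap.addPoint(9, 9.0);  // kept, two slots still hold MAXVAL
 *  \endcode
 */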
/*! \brief KMeansCluster class: one cluster center plus the heap of
 *  its closest points.
 */
template <class IT, class WT>
class KMeansCluster {
  int dimension;
  KmeansHeap<IT, WT> closestPoints;
public:
  WT *center;

  void setParams(int dimension_, int heapsize) {
    this->dimension = dimension_;
    this->center = new WT[dimension_];
    this->closestPoints.setHeapsize(heapsize);
  }

  void clearHeap() {
    this->closestPoints.initValues();
  }

  bool getNewCenters(WT **coords) {
    return this->closestPoints.getNewCenters(center, coords, dimension);
  }

  //! Returns the distance of the element to the cluster center and
  //! offers the point to the heap of closest points.
  WT getDistance(IT index, WT **elementCoords) {
    WT distance = 0;
    for (int i = 0; i < this->dimension; ++i) {
      WT d = (center[i] - elementCoords[i][index]);
      distance += d * d;
    }
    distance = pow(distance, WT(1.0 / this->dimension));
    closestPoints.addPoint(index, distance);
    return distance;
  }

  WT getDistanceToCenter() {
    return closestPoints.getTotalDistance();
  }

  void copyCoordinates(IT *permutation) {
    closestPoints.copyCoordinates(permutation);
  }
};
/*! \brief KMeansAlgorithm class: clusters the coordinates and
 *  returns the closest set of coordinates.
 */
template <class IT, class WT>
class KMeansAlgorithm {
  int dim;
  IT numElements;
  WT **elementCoords;
  IT numClusters;
  IT required_elements;
  KMeansCluster<IT, WT> *clusters;
  WT *maxCoordinates;
  WT *minCoordinates;
public:
  ~KMeansAlgorithm() {
    // ...
    delete [] maxCoordinates;
    delete [] minCoordinates;
  }

  /*! \brief KMeansAlgorithm Constructor */
  KMeansAlgorithm(int dim_, IT numElements_, WT **elementCoords_,
                  IT required_elements_) :
      dim(dim_),
      numElements(numElements_),
      elementCoords(elementCoords_),
      numClusters((1 << dim_) + 1),
      required_elements(required_elements_) {
    // ... (clusters are allocated here)
    for (int i = 0; i < numClusters; ++i) {
      this->clusters[i].setParams(this->dim, this->required_elements);
    }

    this->maxCoordinates = new WT[this->dim];
    this->minCoordinates = new WT[this->dim];

    // obtain the min and max coordinate in each dimension
    for (int j = 0; j < dim; ++j) {
      this->minCoordinates[j] = this->elementCoords[j][0];
      this->maxCoordinates[j] = this->elementCoords[j][0];
      for (IT i = 1; i < numElements; ++i) {
        WT t = this->elementCoords[j][i];
        if (t > this->maxCoordinates[j]) {
          this->maxCoordinates[j] = t;
        }
        if (t < minCoordinates[j]) {
          this->minCoordinates[j] = t;
        }
      }
    }

    // place a center at each corner of the bounding box: cluster i
    // takes the max or min in dimension j depending on bit j of i
    for (int j = 0; j < dim; ++j) {
      int mod = (1 << (j+1));
      for (int i = 0; i < numClusters - 1; ++i) {
        WT c;
        if ((i % mod) < mod / 2) {
          c = this->maxCoordinates[j];
        }
        else {
          c = this->minCoordinates[j];
        }
        this->clusters[i].center[j] = c;
      }
    }

    // the last cluster center is the midpoint of the bounding box
    for (int j = 0; j < dim; ++j) {
      this->clusters[numClusters - 1].center[j] =
        (this->maxCoordinates[j] + this->minCoordinates[j]) / 2;
    }
  }

  //! Performs the clustering iterations.
  void kmeans() {
    for (int it = 0; it < 10; ++it) {
      // ...
      for (IT j = 0; j < this->numClusters; ++j) {
        this->clusters[j].clearHeap();
      }
      for (IT i = 0; i < this->numElements; ++i) {
        // offer each element to every cluster's heap
        for (IT j = 0; j < this->numClusters; ++j) {
          this->clusters[j].getDistance(i, this->elementCoords);
        }
      }
      bool moved = false;
      for (IT j = 0; j < this->numClusters; ++j) {
        moved =
          (this->clusters[j].getNewCenters(this->elementCoords) || moved);
      }
      if (!moved) {
        break;
      }
    }
  }

  //! Finds the cluster with the smallest total distance and copies
  //! its point indices into procPermutation.
  void getMinDistanceCluster(IT *procPermutation) {
    WT minDistance = this->clusters[0].getDistanceToCenter();
    IT minCluster = 0;
    // ...
    for (IT j = 1; j < this->numClusters; ++j) {
      WT minTmpDistance = this->clusters[j].getDistanceToCenter();
      // ...
      if (minTmpDistance < minDistance) {
        minDistance = minTmpDistance;
        minCluster = j;
      }
    }
    // ...
    this->clusters[minCluster].copyCoordinates(procPermutation);
  }
};
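/*! Usage sketch (illustrative; variable names hypothetical): cluster
 *  nprocs processor coordinates into 2^dim + 1 clusters and pull out
 *  the ntasks processors forming the tightest cluster, as
 *  getClosestSubset() does further below.
 *  \code
 *  KMeansAlgorithm<part_t, pcoord_t> kma(dim, nprocs, procCoords, ntasks);
 *  kma.kmeans();
 *  part_t *permutation = new part_t[nprocs];
 *  kma.getMinDistanceCluster(permutation);
 *  // permutation[0 .. ntasks-1] holds the chosen processors
 *  \endcode
 */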
#define MINOF(a,b) (((a)<(b))?(a):(b))

/*! \brief fillContinousArray function: fills the array with the
 *  given value, or with the index sequence when val is NULL.
 */
template <typename T>
void fillContinousArray(T *arr, size_t arrSize, T *val) {
  if (val == NULL) {
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
    for (size_t i = 0; i < arrSize; ++i) {
      arr[i] = i;
    }
  }
  else {
    T v = *val;
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
    for (size_t i = 0; i < arrSize; ++i) {
      arr[i] = v;
    }
  }
}
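/*! Example (illustrative; mirrors the two ways getMapping() below
 *  uses this helper):
 *  \code
 *  part_t invalid = 0;
 *  fillContinousArray<part_t>(xadj, nprocs + 1, &invalid);  // all zeros
 *  fillContinousArray<part_t>(adjList, nprocs, NULL);       // 0,1,2,...
 *  \endcode
 */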
/*! \brief CommunicationModel base class: performs mapping between
 *  the coordinate partitioning result and the processors.
 */
template <typename part_t, typename pcoord_t, typename node_t>
class CommunicationModel {
public:
  part_t no_procs;  //!< the number of processors
  part_t no_tasks;  //!< the number of tasks
  // ...

  //! Sums distance * edge_weight over all communicating task pairs
  //! for the given task-to-processor assignment.
  void calculateCommunicationCost(
      part_t *task_to_proc,
      part_t *task_communication_xadj,
      part_t *task_communication_adj,
      pcoord_t *task_communication_edge_weight) {

    double totalCost = 0;
    part_t commCount = 0;
    for (part_t task = 0; task < this->no_tasks; ++task) {
      int assigned_proc = task_to_proc[task];

      part_t task_adj_begin = task_communication_xadj[task];
      part_t task_adj_end = task_communication_xadj[task + 1];

      commCount += task_adj_end - task_adj_begin;

      for (part_t task2 = task_adj_begin; task2 < task_adj_end; ++task2) {
        part_t neighborTask = task_communication_adj[task2];
        int neighborProc = task_to_proc[neighborTask];
        double distance = this->getProcDistance(assigned_proc, neighborProc);

        if (task_communication_edge_weight == NULL) {
          totalCost += distance;
        }
        else {
          totalCost += distance * task_communication_edge_weight[task2];
        }
      }
    }
    // ... (totalCost is stored as the communication cost metric)
  }
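  /*! Worked example (illustrative): suppose task 0 is assigned to
   *  proc A and its neighbors, tasks 1 and 2, to proc B, with A and
   *  B two hops apart. With no edge weights the loop above adds
   *  2 + 2 = 4 to totalCost; with weights w01 and w02 it adds
   *  2*w01 + 2*w02.
   */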
  /*! \brief Pure virtual mapping function; fills the
   *  processor-to-task and task-to-processor assignments.
   */
  virtual void getMapping(
      int myRank,
      const RCP<const Environment> &env,
      ArrayRCP<part_t> &proc_to_task_xadj,
      ArrayRCP<part_t> &proc_to_task_adj,
      ArrayRCP<part_t> &task_to_proc,
      const Teuchos::RCP<const Teuchos::Comm<int> > comm_
      ) const = 0;
  // ...
};
/*! \brief CoordinateCommunicationModel class: performs mapping
 *  between the coordinate partitioning result and the MPI ranks,
 *  using processor and task coordinates.
 */
template <typename pcoord_t, typename tcoord_t,
          typename part_t, typename node_t>
class CoordinateCommunicationModel :
  public CommunicationModel<part_t, pcoord_t, node_t> {
public:
  // ...

  /*! \brief Class Constructor */
  CoordinateCommunicationModel(
      int pcoord_dim_,
      pcoord_t **pcoords_,
      int tcoord_dim_,
      tcoord_t **tcoords_,
      part_t no_procs_,
      part_t no_tasks_,
      int *machine_extent_,
      bool *machine_extent_wrap_around_,
      const MachineRepresentation<pcoord_t, part_t> *machine_ = NULL)
      /* ... member initialization ... */ {
  }

  /*! \brief Called whenever nprocs > no_tasks: returns only the
   *  subset of processors that are closest to each other, found by
   *  kmeans clustering of the processor coordinates.
   */
  void getClosestSubset(part_t *proc_permutation, part_t nprocs,
                        part_t ntasks) const {
    // ...
    KMeansAlgorithm<part_t, pcoord_t> kma(
      minCoordDim, nprocs,
      this->proc_coords, ntasks);
    // ...
    // mark the processors that are not chosen
    for (int i = ntasks; i < nprocs; ++i) {
      proc_permutation[i] = -1;
    }
  }
  //! Park-Miller minimal standard RNG via Schrage's factorization,
  //! scaled into [0, l). (Constants are the standard MINSTD values.)
  static part_t umpa_uRandom(part_t l, int &_u_umpa_seed) {
    int a = 16807;
    int m = 2147483647;
    int q = 127773;
    int r = 2836;
    int lo, hi, test;
    double d;

    lo = _u_umpa_seed % q;
    hi = _u_umpa_seed / q;
    test = (a * lo) - (r * hi);
    if (test > 0)
      _u_umpa_seed = test;
    else
      _u_umpa_seed = test + m;
    d = (double)((double)_u_umpa_seed / (double)m);
    return (part_t)(d * (double)l);
  }
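  /*! Example (illustrative): Schrage's trick computes
   *  16807 * seed mod (2^31 - 1) without overflowing 32-bit ints,
   *  then the result is scaled into [0, l).
   *  \code
   *  int seed = 123459;                    // any nonzero seed
   *  part_t r1 = umpa_uRandom(100, seed);  // in [0, 100); seed updated
   *  part_t r2 = umpa_uRandom(100, seed);  // next value in the stream
   *  \endcode
   */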
  virtual double getProcDistance(int procId1, int procId2) const {
    pcoord_t distance = 0;
    // ...
    // when a machine representation is available, use its hop count
    this->machine->getHopCount(procId1, procId2, distance);
    // ...
    return distance;
  }
  //! Randomly perturbs the task visit order.
  void update_visit_order(part_t *visitOrder, part_t n,
                          int &_u_umpa_seed, part_t rndm) {
    // ...
    // pick random positions u and v (via umpa_uRandom) and swap
    // blocks of four entries
    for (i = 0; i < n; i += 16) {
      // ...
      ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v], a[u], tmp);
      ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v + 1], a[u + 1], tmp);
      ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v + 2], a[u + 2], tmp);
      ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v + 3], a[u + 3], tmp);
    }
    // ...
    for (i = 1; i < end; i++) {
      // ...
    }
  }
  /*! \brief Partitions the processors and the tasks into the same
   *  number of parts and matches each part of tasks to one processor.
   */
  virtual void getMapping(
      int myRank,
      const RCP<const Environment> &env,
      ArrayRCP<part_t> &rcp_proc_to_task_xadj,
      ArrayRCP<part_t> &rcp_proc_to_task_adj,
      ArrayRCP<part_t> &rcp_task_to_proc,
      const Teuchos::RCP<const Teuchos::Comm<int> > comm_
      ) const {

    rcp_proc_to_task_xadj = ArrayRCP<part_t>(this->no_procs + 1);
    rcp_proc_to_task_adj = ArrayRCP<part_t>(this->no_tasks);
    rcp_task_to_proc = ArrayRCP<part_t>(this->no_tasks);

    // holds the number of tasks of each proc, then the CRS begin
    // indices after the prefix sum below
    part_t *proc_to_task_xadj = rcp_proc_to_task_xadj.getRawPtr();
    // holds the tasks assigned to each proc
    part_t *proc_to_task_adj = rcp_proc_to_task_adj.getRawPtr();
    // holds the processor assigned to each task
    part_t *task_to_proc = rcp_task_to_proc.getRawPtr();

    part_t invalid = 0;
    fillContinousArray<part_t>(proc_to_task_xadj,
                               this->no_procs + 1, &invalid);
    // ...
    recursion_depth = log(float(this->no_procs)) / log(2.0) + 1;
    // ...
    // number of task and processor dimension orderings to try
    // (permutations of at most 8 dimensions)
    taskPerm = z2Fact<int>(8);  // 8! = 40320
    // ...
    procPerm = z2Fact<int>(8);
    // ...
    int permutations = taskPerm * procPerm;
    // plus the runs that order one side along its longest dimension
    permutations += taskPerm;
    // ...
    permutations += procPerm;
    // ...
    // if there are more processors than tasks, only the closest
    // subset of processors will be used (see getClosestSubset)
    if (this->no_procs > this->no_tasks) {
      // ...
    }
    // ...
    fillContinousArray<part_t>(proc_adjList, this->no_procs, NULL);
    // ...
    // distribute the permutation attempts over the ranks
    int myPermutation = myRank % permutations;
    bool task_partition_along_longest_dim = false;
    bool proc_partition_along_longest_dim = false;

    int myProcPerm = 0;
    int myTaskPerm = 0;

    if (myPermutation == 0) {
      task_partition_along_longest_dim = true;
      proc_partition_along_longest_dim = true;
    }
    else {
      --myPermutation;
      if (myPermutation < taskPerm) {
        proc_partition_along_longest_dim = true;
        // try this task permutation with the longest-dim proc order
        myTaskPerm = myPermutation;
      }
      else {
        myPermutation -= taskPerm;
        if (myPermutation < procPerm) {
          task_partition_along_longest_dim = true;
          // try this proc permutation with the longest-dim task order
          myProcPerm = myPermutation;
        }
        else {
          myPermutation -= procPerm;
          // try a distinct (proc, task) permutation pair
          myProcPerm = myPermutation % procPerm;
          // ...
          myTaskPerm = myPermutation / procPerm;
        }
      }
    }
    // ...
    // get the myProcPerm-th permutation of the proc dimensions
    if (this->proc_coord_dim <= 8)
      ithPermutation<int>(this->proc_coord_dim, myProcPerm, permutation);
    else
      ithPermutation<int>(8, myProcPerm, permutation);
    // ...
    // number of unique processor groups on the machine (e.g. racks),
    // used for a first-level partitioning when present
    part_t num_group_count = 1;
    part_t *group_count = NULL;

    if (machine)
      num_group_count = machine->getNumUniqueGroups();

    if (num_group_count > 1) {
      group_count = new part_t[num_group_count];
      memset(group_count, 0, sizeof(part_t) * num_group_count);

      machine->getGroupCount(group_count);
    }
    env->timerStart(MACRO_TIMERS, "Mapping - Proc Partitioning");

    typedef typename node_t::device_type device_t;
    // transfer the processor coordinates to a Kokkos view
    Kokkos::View<pcoord_t**, Kokkos::LayoutLeft, device_t>
      kokkos_pcoords("pcoords", this->no_procs, procdim);
    auto host_kokkos_pcoords = Kokkos::create_mirror_view(kokkos_pcoords);
    for (int i = 0; i < procdim; ++i) {
      for (int j = 0; j < this->no_procs; ++j) {
        host_kokkos_pcoords(j, i) = pcoords[i][j];
      }
    }
    Kokkos::deep_copy(kokkos_pcoords, host_kokkos_pcoords);

    Kokkos::View<part_t*, device_t>
      initial_selected_coords_output_permutation_pcoords(
        "initial_selected_coords_output_permutation_pcoords",
        this->no_procs);
    typename Kokkos::View<part_t*, device_t>::HostMirror
      host_initial_selected_coords_output_permutation_pcoords =
        Kokkos::create_mirror_view(
          initial_selected_coords_output_permutation_pcoords);
    for (int n = 0; n < this->no_procs; ++n) {
      host_initial_selected_coords_output_permutation_pcoords(n) =
        proc_adjList[n];
    }
    Kokkos::deep_copy(initial_selected_coords_output_permutation_pcoords,
      host_initial_selected_coords_output_permutation_pcoords);

    Kokkos::View<part_t *, Kokkos::HostSpace> kokkos_group_count(
      "kokkos_group_count", group_count ? num_group_count : 0);
    if (group_count) {
      for (int n = 0; n < num_group_count; ++n) {
        kokkos_group_count(n) = group_count[n];
      }
    }

    // partition the processors
    this->mj_partitioner.sequential_task_partitioning(
      env,
      /* ... */
      initial_selected_coords_output_permutation_pcoords,
      /* ... */
      proc_partition_along_longest_dim,
      /* ... */
      kokkos_group_count);
    env->timerStop(MACRO_TIMERS, "Mapping - Proc Partitioning");

    // copy the resulting permutation back to proc_adjList
    Kokkos::deep_copy(host_initial_selected_coords_output_permutation_pcoords,
      initial_selected_coords_output_permutation_pcoords);
    for (int n = 0; n < this->no_procs; ++n) {
      proc_adjList[n] =
        host_initial_selected_coords_output_permutation_pcoords(n);
    }
    // ...
    // task partitioning mirrors the processor partitioning
    fillContinousArray<part_t>(task_adjList, this->no_tasks, NULL);
    // ...
    // get the myTaskPerm-th permutation of the task dimensions
    if (this->task_coord_dim <= 8)
      ithPermutation<int>(this->task_coord_dim, myTaskPerm, permutation);
    else
      ithPermutation<int>(8, myTaskPerm, permutation);
    // ...
    Kokkos::View<tcoord_t**, Kokkos::LayoutLeft, device_t>
      kokkos_tcoords("tcoords", this->no_tasks, this->task_coord_dim);
    auto host_kokkos_tcoords = Kokkos::create_mirror_view(kokkos_tcoords);
    for (int i = 0; i < this->task_coord_dim; ++i) {
      for (int j = 0; j < this->no_tasks; ++j) {
        host_kokkos_tcoords(j, i) = tcoords[i][j];
      }
    }
    Kokkos::deep_copy(kokkos_tcoords, host_kokkos_tcoords);

    env->timerStart(MACRO_TIMERS, "Mapping - Task Partitioning");

    Kokkos::View<part_t*, device_t>
      initial_selected_coords_output_permutation_tcoords(
        "initial_selected_coords_output_permutation_tcoords",
        this->no_tasks);
    typename Kokkos::View<part_t*, device_t>::HostMirror
      host_initial_selected_coords_output_permutation_tcoords =
        Kokkos::create_mirror_view(
          initial_selected_coords_output_permutation_tcoords);
    for (int n = 0; n < this->no_tasks; ++n) {
      host_initial_selected_coords_output_permutation_tcoords(n) =
        task_adjList[n];
    }
    Kokkos::deep_copy(initial_selected_coords_output_permutation_tcoords,
      host_initial_selected_coords_output_permutation_tcoords);

    // partition the tasks
    this->mj_partitioner.sequential_task_partitioning(
      env,
      /* ... */
      this->task_coord_dim,
      /* ... */
      initial_selected_coords_output_permutation_tcoords,
      /* ... */
      task_partition_along_longest_dim,
      /* ... */
      kokkos_group_count);
    env->timerStop(MACRO_TIMERS, "Mapping - Task Partitioning");

    Kokkos::deep_copy(host_initial_selected_coords_output_permutation_tcoords,
      initial_selected_coords_output_permutation_tcoords);
    for (int n = 0; n < this->no_tasks; ++n) {
      task_adjList[n] =
        host_initial_selected_coords_output_permutation_tcoords(n);
    }
    // ...
    delete [] permutation;
    // ...
    // check the partitioning and count the tasks of each processor
    for (part_t i = 0; i < num_parts; ++i) {

      part_t proc_index_begin = proc_xadj[i];
      part_t task_begin_index = task_xadj[i];
      part_t proc_index_end = proc_xadj[i + 1];
      part_t task_end_index = task_xadj[i + 1];

      // every part must have been assigned exactly one processor
      if (proc_index_end - proc_index_begin != 1) {
        std::cerr << "Error at partitioning of processors" << std::endl;
        std::cerr << "PART:" << i << " is assigned to "
                  << proc_index_end - proc_index_begin << " processors."
                  << std::endl;
        exit(1);
      }
      part_t assigned_proc = proc_adjList[proc_index_begin];
      proc_to_task_xadj[assigned_proc] = task_end_index - task_begin_index;
    }

    // prefix sum: proc_to_task_xadj[i] becomes the CRS begin index
    // of the tasks of processor i
    part_t *proc_to_task_xadj_work = new part_t[this->no_procs];
    part_t sum = 0;
    for (part_t i = 0; i < this->no_procs; ++i) {
      part_t tmp = proc_to_task_xadj[i];
      proc_to_task_xadj[i] = sum;
      sum += tmp;
      proc_to_task_xadj_work[i] = sum;
    }
    proc_to_task_xadj[this->no_procs] = sum;

    for (part_t i = 0; i < num_parts; ++i) {

      part_t proc_index_begin = proc_xadj[i];
      part_t task_begin_index = task_xadj[i];
      part_t task_end_index = task_xadj[i + 1];

      part_t assigned_proc = proc_adjList[proc_index_begin];

      for (part_t j = task_begin_index; j < task_end_index; ++j) {
        part_t taskId = task_adjList[j];

        task_to_proc[taskId] = assigned_proc;

        proc_to_task_adj[--proc_to_task_xadj_work[assigned_proc]] = taskId;
      }
    }
    // ...
    delete [] proc_to_task_xadj_work;
    delete [] task_xadj;
    delete [] task_adjList;
    delete [] proc_xadj;
    delete [] proc_adjList;
  }
  // ...
};
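/*! Example (illustrative): getMapping() above returns the mapping in
 *  CRS form. With 2 processors and 4 tasks, where proc 0 gets tasks
 *  {2, 3} and proc 1 gets tasks {0, 1}:
 *    proc_to_task_xadj = {0, 2, 4}
 *    proc_to_task_adj  = {2, 3, 0, 1}
 *    task_to_proc      = {1, 1, 0, 0}
 */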
/*! \brief CoordinateTaskMapper: maps the coordinate partitioning
 *  result (or a given part array) to the MPI ranks.
 */
template <typename Adapter, typename part_t>
class CoordinateTaskMapper : public PartitionMapping<Adapter> {
protected:

#ifndef DOXYGEN_SHOULD_SKIP_THIS
  // ...
  typedef typename Adapter::scalar_t pcoord_t;
  typedef typename Adapter::scalar_t tcoord_t;
  typedef typename Adapter::scalar_t scalar_t;
  // ...
  // device node type used by the Kokkos-based partitioner
#if defined(KOKKOS_ENABLE_CUDA)
  using node_t = Tpetra::KokkosCompat::KokkosDeviceWrapperNode<
    Kokkos::Cuda, Kokkos::CudaSpace>;
#elif defined(KOKKOS_ENABLE_HIP)
  using node_t = Tpetra::KokkosCompat::KokkosDeviceWrapperNode<
    Kokkos::HIP, Kokkos::HIPSpace>;
  // ... (default node type for other backends)
#endif
  /*! \brief doMapping function: calls the getMapping function of the
   *  communicationModel object.
   */
  void doMapping(int myRank,
                 const Teuchos::RCP<const Teuchos::Comm<int> > comm_) {
    if (this->proc_task_comm) {
      this->proc_task_comm->getMapping(
        myRank,
        this->env,
        this->proc_to_task_xadj,
        this->proc_to_task_adj,
        this->task_to_proc,
        comm_);
    }
    else {
      std::cerr << "communicationModel is not specified in the Mapper"
                << std::endl;
      exit(1);
    }
  }
  /*! \brief Creates and returns the subcommunicator for the
   *  processor group that tries the same set of permutations.
   */
  RCP<Comm<int> > create_subCommunicator() {
    // ...
    // number of dimension orderings to try on each side
    if (taskDim <= 8)
      taskPerm = z2Fact<int>(taskDim);  // factorial of the task dim
    else
      taskPerm = z2Fact<int>(8);        // capped at 8!
    if (procDim <= 8)
      procPerm = z2Fact<int>(procDim);
    else
      procPerm = z2Fact<int>(8);

    // all (task, proc) permutation pairs ...
    int idealGroupSize = taskPerm * procPerm;
    // ... plus the longest-dim variants for each side and for both
    idealGroupSize += taskPerm + procPerm + 1;

    int myRank = this->comm->getRank();
    int commSize = this->comm->getSize();

    int myGroupIndex = myRank / idealGroupSize;

    int prevGroupBegin = (myGroupIndex - 1) * idealGroupSize;
    if (prevGroupBegin < 0) prevGroupBegin = 0;
    int myGroupBegin = myGroupIndex * idealGroupSize;
    int myGroupEnd = (myGroupIndex + 1) * idealGroupSize;
    int nextGroupEnd = (myGroupIndex + 2) * idealGroupSize;

    // a trailing partial group is merged into its predecessor
    if (myGroupEnd > commSize) {
      myGroupBegin = prevGroupBegin;
      myGroupEnd = commSize;
    }
    if (nextGroupEnd > commSize) {
      myGroupEnd = commSize;
    }
    int myGroupSize = myGroupEnd - myGroupBegin;

    part_t *myGroup = new part_t[myGroupSize];
    for (int i = 0; i < myGroupSize; ++i) {
      myGroup[i] = myGroupBegin + i;
    }

    ArrayView<const part_t> myGroupView(myGroup, myGroupSize);

    RCP<Comm<int> > subComm =
      this->comm->createSubcommunicator(myGroupView);
    delete [] myGroup;
    return subComm;
  }
  /*! \brief Finds the lowest cost mapping in the group and
   *  broadcasts that solution to everyone.
   */
  void getBestMapping() {
    RCP<Comm<int> > subComm = this->create_subCommunicator();
    // ...
    double myCost = this->proc_task_comm->getCommunicationCostMetric();
    // ...
    double localCost[2], globalCost[2];

    localCost[0] = myCost;
    localCost[1] = double(subComm->getRank());

    globalCost[1] = globalCost[0] = std::numeric_limits<double>::max();
    Zoltan2_ReduceBestMapping<int, double> reduceBest;
    reduceAll<int, double>(*subComm, reduceBest,
                           2, localCost, globalCost);

    int sender = int(globalCost[1]);
    // ...
    // broadcast the winning rank's mapping arrays
    broadcast(*subComm, sender, this->ntasks,
              this->task_to_proc.getRawPtr());
    broadcast(*subComm, sender, this->nprocs,
              this->proc_to_task_xadj.getRawPtr());
    broadcast(*subComm, sender, this->ntasks,
              this->proc_to_task_adj.getRawPtr());
  }
  //! Writes the mapping to gnuplot files for visualization.
  void writeMapping() {
    std::ofstream gnuPlotCode("gnuPlot.plot", std::ofstream::out);
    // ...
    std::string ss = "";
    for (part_t i = 0; i < this->nprocs; ++i) {
      std::string procFile = Teuchos::toString<int>(i) + "_mapping.txt";
      if (i == 0) {
        gnuPlotCode << "plot \"" << procFile << "\"\n";
      }
      else {
        gnuPlotCode << "replot \"" << procFile << "\"\n";
      }

      std::ofstream inpFile(procFile.c_str(), std::ofstream::out);

      std::string gnuPlotArrow = "set arrow from ";
      // write the proc coordinates as the arrow source
      for (int j = 0; j < mindim; ++j) {
        if (j == mindim - 1) {
          // ... (last coordinate, no trailing comma)
        }
        else {
          gnuPlotArrow +=
            Teuchos::toString<float>(proc_coords[j][i]) + ",";
        }
      }
      gnuPlotArrow += " to ";

      inpFile << std::endl;
      ArrayView<part_t> a = this->getAssignedTasksForProc(i);
      for (int k = 0; k < a.size(); ++k) {
        part_t j = a[k];
        // write the task coordinates as the arrow target
        std::string gnuPlotArrow2 = gnuPlotArrow;
        for (int z = 0; z < mindim; ++z) {
          if (z == mindim - 1) {
            // ... (last coordinate, no trailing comma)
          }
          else {
            gnuPlotArrow2 +=
              Teuchos::toString<float>(task_coords[z][j]) + ",";
          }
        }
        ss += gnuPlotArrow2 + "\n";
        inpFile << std::endl;
      }
      inpFile.close();
    }
    gnuPlotCode << "\nreplot\n pause -1 \n";
    gnuPlotCode.close();
  }
  //! Writes the proc-to-proc communication arrows to gnuplot files
  //! (3-dimensional machine coordinates only).
  void writeMapping2(int myRank) {
    std::string rankStr = Teuchos::toString<int>(myRank);
    std::string gnuPlots = "gnuPlot", extentionS = ".plot";
    std::string outF = gnuPlots + rankStr + extentionS;
    std::ofstream gnuPlotCode(outF.c_str(), std::ofstream::out);

    CoordinateCommunicationModel<pcoord_t, tcoord_t, part_t, node_t>
      *tmpproc_task_comm =
        /* ... (the communication model of this mapper) */;
    // ...
    int mindim = tmpproc_task_comm->proc_coord_dim;
    if (mindim != 3) {
      std::cerr << "Mapping Write is only good for 3 dim" << std::endl;
      return;
    }
    std::string ss = "";
    std::string procs = "";

    std::set<std::tuple<int, int, int, int, int, int> > my_arrows;
    for (part_t origin_rank = 0; origin_rank < this->nprocs;
         ++origin_rank) {
      ArrayView<part_t> a = this->getAssignedTasksForProc(origin_rank);
      if (a.size() == 0) {
        continue;
      }

      std::string gnuPlotArrow = "set arrow from ";
      for (int j = 0; j < mindim; ++j) {
        if (j == mindim - 1) {
          gnuPlotArrow +=
            Teuchos::toString<float>(tmpproc_task_comm->
              proc_coords[j][origin_rank]);
          procs +=
            Teuchos::toString<float>(tmpproc_task_comm->
              proc_coords[j][origin_rank]);
        }
        else {
          gnuPlotArrow +=
            Teuchos::toString<float>(tmpproc_task_comm->
              proc_coords[j][origin_rank]) + ",";
          procs +=
            Teuchos::toString<float>(tmpproc_task_comm->
              proc_coords[j][origin_rank]) + " ";
        }
      }
      procs += "\n";

      gnuPlotArrow += " to ";

      for (int k = 0; k < a.size(); ++k) {
        int origin_task = a[k];

        for (int nind = task_communication_xadj[origin_task];
             nind < task_communication_xadj[origin_task + 1]; ++nind) {
          int neighbor_task = task_communication_adj[nind];

          bool differentnode = false;
          int neighbor_rank = this->task_to_proc[neighbor_task];

          // only draw an arrow if the endpoints are on different nodes
          for (int j = 0; j < mindim; ++j) {
            if (int(tmpproc_task_comm->proc_coords[j][origin_rank]) !=
                int(tmpproc_task_comm->proc_coords[j][neighbor_rank])) {
              differentnode = true;
              break;
            }
          }
          std::tuple<int, int, int, int, int, int> foo(
            int(tmpproc_task_comm->proc_coords[0][origin_rank]),
            int(tmpproc_task_comm->proc_coords[1][origin_rank]),
            int(tmpproc_task_comm->proc_coords[2][origin_rank]),
            int(tmpproc_task_comm->proc_coords[0][neighbor_rank]),
            int(tmpproc_task_comm->proc_coords[1][neighbor_rank]),
            int(tmpproc_task_comm->proc_coords[2][neighbor_rank]));

          // skip duplicate arrows between the same pair of nodes
          if (differentnode && my_arrows.find(foo) == my_arrows.end()) {
            my_arrows.insert(foo);

            std::string gnuPlotArrow2 = "";
            for (int j = 0; j < mindim; ++j) {
              if (j == mindim - 1) {
                gnuPlotArrow2 +=
                  Teuchos::toString<float>(tmpproc_task_comm->
                    proc_coords[j][neighbor_rank]);
              }
              else {
                gnuPlotArrow2 +=
                  Teuchos::toString<float>(tmpproc_task_comm->
                    proc_coords[j][neighbor_rank]) + ",";
              }
            }
            ss += gnuPlotArrow + gnuPlotArrow2 + " nohead\n";
          }
        }
      }
    }

    std::ofstream procFile("procPlot.plot", std::ofstream::out);
    procFile << procs << "\n";
    procFile.close();

    if (mindim == 2) {
      gnuPlotCode << "plot \"procPlot.plot\" with points pointsize 3\n";
    }
    else {
      gnuPlotCode << "splot \"procPlot.plot\" with points pointsize 3\n";
    }

    gnuPlotCode << ss << "\nreplot\n pause -1 \n";
    gnuPlotCode.close();
  }
  //! Writes the part boxes and part-center communication arrows to
  //! gnuplot files.
  void writeGnuPlot(
      const Teuchos::Comm<int> *comm_,
      const Zoltan2::PartitioningSolution<Adapter> *soln_,
      int coordDim,
      tcoord_t **partCenters) {

    std::string file = "gggnuPlot";
    std::string exten = ".plot";
    std::ofstream mm("2d.txt");
    file += Teuchos::toString<int>(comm_->getRank()) + exten;
    std::ofstream ff(file.c_str());
    // ...
    std::vector<Zoltan2::coordinateModelPartBox<tcoord_t, part_t> >
      outPartBoxes =
        /* ... (the solution's part boxes view) */;

    for (part_t i = 0; i < this->ntasks; ++i) {
      outPartBoxes[i].writeGnuPlot(ff, mm);
    }
    if (coordDim == 2) {
      ff << "plot \"2d.txt\"" << std::endl;
    }
    else {
      ff << "splot \"2d.txt\"" << std::endl;
    }
    // ...
    ff << "set style arrow 5 nohead size screen 0.03,15,135 ls 1"
       << std::endl;
    for (part_t i = 0; i < this->ntasks; ++i) {
      part_t pb = task_communication_xadj[i];
      part_t pe = task_communication_xadj[i + 1];
      for (part_t p = pb; p < pe; ++p) {
        part_t n = task_communication_adj[p];
        // draw an arrow from the neighbor part's center to part i's
        std::string arrowline = "set arrow from ";
        for (int j = 0; j < coordDim - 1; ++j) {
          arrowline +=
            Teuchos::toString<tcoord_t>(partCenters[j][n]) + ",";
        }
        arrowline +=
          Teuchos::toString<tcoord_t>(partCenters[coordDim - 1][n]) +
          " to ";
        for (int j = 0; j < coordDim - 1; ++j) {
          arrowline +=
            Teuchos::toString<tcoord_t>(partCenters[j][i]) + ",";
        }
        arrowline +=
          Teuchos::toString<tcoord_t>(partCenters[coordDim - 1][i]) +
          "\n";
        // ...
        ff << arrowline;
      }
    }
    ff << "replot\n pause -1" << std::endl;
    ff.close();
  }
  //! Returns raw pointers to the proc-to-task CRS arrays.
  void getProcTask(part_t* &proc_to_task_xadj_,
                   part_t* &proc_to_task_adj_) {
    proc_to_task_xadj_ = this->proc_to_task_xadj.getRawPtr();
    proc_to_task_adj_ = this->proc_to_task_adj.getRawPtr();
  }

  //! Builds the rank lookup for each local coordinate from its part.
  void create_local_task_to_rank(
      const lno_t num_local_coords,
      const part_t *local_coord_parts,
      const ArrayRCP<part_t> task_to_proc_) {
    local_task_to_rank = ArrayRCP<part_t>(num_local_coords);

    for (lno_t i = 0; i < num_local_coords; ++i) {
      part_t local_coord_part = local_coord_parts[i];
      part_t rank_index = task_to_proc_[local_coord_part];
      local_task_to_rank[i] = rank_index;
    }
  }
public:
  /*! \brief Constructor. When this constructor is called, the
   *  communication cost metric is also computed, using the graph
   *  model built from the input adapter.
   */
  CoordinateTaskMapper(
      const Teuchos::RCP<const Teuchos::Comm<int> > comm_,
      const Teuchos::RCP<const MachineRepresentation<pcoord_t, part_t> >
        machine_,
      const Teuchos::RCP<const Adapter> input_adapter_,
      const Teuchos::RCP<const Zoltan2::PartitioningSolution<Adapter> >
        soln_,
      const Teuchos::RCP<const Environment> envConst,
      bool is_input_adapter_distributed = true,
      int num_ranks_per_node = 1,
      bool divide_to_prime_first = false,
      bool reduce_best_mapping = true) :
      // ... base class and member initialization ...
  {
    using namespace Teuchos;
    typedef typename Adapter::base_adapter_t ctm_base_adapter_t;

    RCP<Zoltan2::GraphModel<ctm_base_adapter_t> > graph_model_;
    RCP<Zoltan2::CoordinateModel<ctm_base_adapter_t> > coordinateModel_;

    RCP<const Teuchos::Comm<int> > rcp_comm = comm_;
    RCP<const Teuchos::Comm<int> > ia_comm = rcp_comm;
    if (!is_input_adapter_distributed) {
      ia_comm = Teuchos::createSerialComm<int>();
    }

    RCP<const Environment> envConst_ = envConst;

    RCP<const ctm_base_adapter_t> baseInputAdapter_(
      rcp(dynamic_cast<const ctm_base_adapter_t *>(
            input_adapter_.getRawPtr()), false));
    // ...
    coordinateModel_ = rcp(new CoordinateModel<ctm_base_adapter_t>(
      baseInputAdapter_, envConst_, ia_comm, coordFlags_));
    // ... (created only when the adapter provides a graph)
    graph_model_ = rcp(new GraphModel<ctm_base_adapter_t>(
      baseInputAdapter_, envConst_, ia_comm, graphFlags_));
    if (!machine_->hasMachineCoordinates()) {
      throw std::runtime_error("Existing machine does not provide "
                               "coordinates for coordinate task mapping");
    }

    // get the processor coordinates
    int procDim = machine_->getMachineDim();
    this->nprocs = machine_->getNumRanks();

    pcoord_t **procCoordinates = NULL;
    if (!machine_->getAllMachineCoordinatesView(procCoordinates)) {
      throw std::runtime_error("Existing machine does not implement "
                               "getAllMachineCoordinatesView");
    }
    // ...
    std::vector<int> machine_extent_vec(procDim);
    // ...
    int *machine_extent = &(machine_extent_vec[0]);
    bool *machine_extent_wrap_around = new bool[procDim];
    for (int i = 0; i < procDim; ++i)
      machine_extent_wrap_around[i] = false;

    bool haveWrapArounds =
      machine_->getMachineExtentWrapArounds(machine_extent_wrap_around);

    // if the machine has a wrap-around extent, shift the coordinates
    // so that the largest gap sits at the boundary
    if (machine_->getMachineExtent(machine_extent) &&
        haveWrapArounds) {
      procCoordinates =
        this->shiftMachineCoordinates(
          procDim,
          machine_extent,
          machine_extent_wrap_around,
          this->nprocs,
          procCoordinates);
    }
    // get the task information: coordinate dimension, number of parts
    int coordDim = coordinateModel_->getCoordinateDim();
    // ...
    // allocate memory for the part centers
    tcoord_t **partCenters = new tcoord_t*[coordDim];
    for (int i = 0; i < coordDim; ++i) {
      partCenters[i] = new tcoord_t[this->ntasks];
    }

    typedef typename Adapter::scalar_t t_scalar_t;

    envConst->timerStart(MACRO_TIMERS, "Mapping - Solution Center");

    // compute the center coordinate of each part
    getSolutionCenterCoordinates<Adapter, t_scalar_t, part_t>(
      envConst.getRawPtr(),
      ia_comm.getRawPtr(),
      coordinateModel_.getRawPtr(),
      this->solution_parts,
      coordDim,
      this->ntasks,
      partCenters);

    envConst->timerStop(MACRO_TIMERS, "Mapping - Solution Center");

    // build the coarsened part graph, if a graph model is available
    if (graph_model_.getRawPtr() != NULL) {
      getCoarsenedPartGraph<Adapter, t_scalar_t, part_t>(
        envConst.getRawPtr(),
        ia_comm.getRawPtr(),
        graph_model_.getRawPtr(),
        this->ntasks,
        this->solution_parts,
        task_communication_xadj,
        task_communication_adj,
        task_communication_edge_weight);
    }

    // create the communication model
    this->proc_task_comm = new Zoltan2::CoordinateCommunicationModel<
      pcoord_t, tcoord_t, part_t, node_t>(
        procDim,
        procCoordinates,
        coordDim,
        partCenters,
        this->nprocs,
        this->ntasks,
        machine_extent,
        machine_extent_wrap_around,
        machine_.getRawPtr());
    int myRank = comm_->getRank();
    // ...
    this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;

    envConst->timerStart(MACRO_TIMERS, "Mapping - Processor Task map");
    this->doMapping(myRank, comm_);
    envConst->timerStop(MACRO_TIMERS, "Mapping - Processor Task map");

    envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Graph");
    // ...
    envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Graph");

    if (comm_->getRank() == 0) {
      // report the total communication and the maximum neighbor count
      // ...
      std::cout << " TotalComm:"
        // ...
        << std::endl;
      for (part_t i = 1; i <= taskCommCount; ++i) {
        // ...
      }
      std::cout << " maxNeighbor:" << maxN << std::endl;

      this->writeGnuPlot(comm_, soln_, coordDim, partCenters);
    }

    envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Cost");
    // ...
    envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Cost");
    // ...
    // build the local-task-to-rank lookup for the local coordinates
    this->create_local_task_to_rank(
      coordinateModel_->getLocalNumCoordinates(),
      this->solution_parts,
      this->task_to_proc);
    // ...
    delete [] machine_extent_wrap_around;

    if (machine_->getMachineExtent(machine_extent) &&
        haveWrapArounds) {
      for (int i = 0; i < procDim; ++i) {
        delete [] procCoordinates[i];
      }
      delete [] procCoordinates;
    }

    for (int i = 0; i < coordDim; ++i) {
      delete [] partCenters[i];
    }
    delete [] partCenters;
  }
  /*! \brief Constructor. Instead of a Solution we have two
   *  parameters: the number of parts and the result part array.
   */
  CoordinateTaskMapper(
      const Teuchos::RCP<const Teuchos::Comm<int> > comm_,
      const Teuchos::RCP<const MachineRepresentation<pcoord_t, part_t> >
        machine_,
      const Teuchos::RCP<const Adapter> input_adapter_,
      const part_t num_parts_,
      const part_t *result_parts,
      const Teuchos::RCP<const Environment> envConst,
      bool is_input_adapter_distributed = true,
      int num_ranks_per_node = 1,
      bool divide_to_prime_first = false,
      bool reduce_best_mapping = true) :
      PartitionMapping<Adapter>(comm_, machine_, input_adapter_,
        num_parts_, result_parts, envConst),
      // ... remaining member initialization ...
  {
    using namespace Teuchos;
    typedef typename Adapter::base_adapter_t ctm_base_adapter_t;

    RCP<Zoltan2::GraphModel<ctm_base_adapter_t> > graph_model_;
    RCP<Zoltan2::CoordinateModel<ctm_base_adapter_t> > coordinateModel_;

    RCP<const Teuchos::Comm<int> > rcp_comm = comm_;
    RCP<const Teuchos::Comm<int> > ia_comm = rcp_comm;
    if (!is_input_adapter_distributed) {
      ia_comm = Teuchos::createSerialComm<int>();
    }

    RCP<const Environment> envConst_ = envConst;

    RCP<const ctm_base_adapter_t> baseInputAdapter_(
      rcp(dynamic_cast<const ctm_base_adapter_t *>(
            input_adapter_.getRawPtr()), false));
    // ...
    coordinateModel_ = rcp(new CoordinateModel<ctm_base_adapter_t>(
      baseInputAdapter_, envConst_, ia_comm, coordFlags_));
    // ... (created only when the adapter provides a graph)
    graph_model_ = rcp(new GraphModel<ctm_base_adapter_t>(
      baseInputAdapter_, envConst_, ia_comm, graphFlags_));

    if (!machine_->hasMachineCoordinates()) {
      throw std::runtime_error("Existing machine does not provide "
                               "coordinates for coordinate task mapping.");
    }

    // get the processor coordinates
    int procDim = machine_->getMachineDim();
    this->nprocs = machine_->getNumRanks();

    pcoord_t **procCoordinates = NULL;
    if (!machine_->getAllMachineCoordinatesView(procCoordinates)) {
      throw std::runtime_error("Existing machine does not implement "
                               "getAllMachineCoordinatesView");
    }
    // ...
    std::vector<int> machine_extent_vec(procDim);
    // ...
    int *machine_extent = &(machine_extent_vec[0]);
    bool *machine_extent_wrap_around = new bool[procDim];
    bool haveWrapArounds =
      machine_->getMachineExtentWrapArounds(machine_extent_wrap_around);

    // shift the wrap-around coordinates so that the largest gap sits
    // at the boundary
    if (machine_->getMachineExtent(machine_extent) &&
        haveWrapArounds) {
      procCoordinates =
        this->shiftMachineCoordinates(
          procDim,
          machine_extent,
          machine_extent_wrap_around,
          this->nprocs,
          procCoordinates);
    }

    // get the task information
    int coordDim = coordinateModel_->getCoordinateDim();
    // ...
    this->ntasks = num_parts_;

    // allocate memory for the part centers
    tcoord_t **partCenters = new tcoord_t*[coordDim];
    for (int i = 0; i < coordDim; ++i) {
      partCenters[i] = new tcoord_t[this->ntasks];
    }

    typedef typename Adapter::scalar_t t_scalar_t;

    envConst->timerStart(MACRO_TIMERS, "Mapping - Solution Center");

    // compute the center coordinate of each part
    getSolutionCenterCoordinates<Adapter, t_scalar_t, part_t>(
      envConst.getRawPtr(),
      ia_comm.getRawPtr(),
      coordinateModel_.getRawPtr(),
      result_parts,
      coordDim,
      this->ntasks,
      partCenters);

    envConst->timerStop(MACRO_TIMERS, "Mapping - Solution Center");

    // build the coarsened part graph, if a graph model is available
    if (graph_model_.getRawPtr() != NULL) {
      getCoarsenedPartGraph<Adapter, t_scalar_t, part_t>(
        envConst.getRawPtr(),
        ia_comm.getRawPtr(),
        graph_model_.getRawPtr(),
        this->ntasks,
        result_parts,
        task_communication_xadj,
        task_communication_adj,
        task_communication_edge_weight);
    }

    envConst->timerStart(MACRO_TIMERS,
                         "CoordinateCommunicationModel Create");

    // create the communication model
    this->proc_task_comm = new Zoltan2::CoordinateCommunicationModel<
      pcoord_t, tcoord_t, part_t, node_t>(
        procDim,
        procCoordinates,
        coordDim,
        partCenters,
        this->nprocs,
        this->ntasks,
        machine_extent,
        machine_extent_wrap_around,
        machine_.getRawPtr());

    envConst->timerStop(MACRO_TIMERS,
                        "CoordinateCommunicationModel Create");
    this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;

    int myRank = comm_->getRank();

    envConst->timerStart(MACRO_TIMERS, "Mapping - Processor Task map");
    this->doMapping(myRank, comm_);
    envConst->timerStop(MACRO_TIMERS, "Mapping - Processor Task map");

    envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Graph");
    // ...
    envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Graph");

    if (comm_->getRank() == 0) {
      // report the total communication and the maximum neighbor count
      // ...
      std::cout << " TotalComm:"
        // ...
        << std::endl;
      for (part_t i = 1; i <= taskCommCount; ++i) {
        // ...
      }
      std::cout << " maxNeighbor:" << maxN << std::endl;

      this->writeGnuPlot(comm_, soln_, coordDim, partCenters);
    }

    envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Cost");
    // ...
    envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Cost");
    // ...
    // build the local-task-to-rank lookup for the local coordinates
    this->create_local_task_to_rank(
      coordinateModel_->getLocalNumCoordinates(),
      result_parts,
      this->task_to_proc);
    // ...
    delete [] machine_extent_wrap_around;

    if (machine_->getMachineExtent(machine_extent) &&
        haveWrapArounds) {
      for (int i = 0; i < procDim; ++i) {
        delete [] procCoordinates[i];
      }
      delete [] procCoordinates;
    }

    for (int i = 0; i < coordDim; ++i) {
      delete [] partCenters[i];
    }
    delete [] partCenters;
  }
  /*! \brief Constructor. Performs the mapping operation directly on
   *  raw coordinate arrays. The result can be obtained through
   *  getAssignedProcForTask (the processor of a given task) and
   *  getPartsForProc (the tasks of a given processor).
   */
  CoordinateTaskMapper(
      const Environment *env_const_,
      const Teuchos::Comm<int> *problemComm,
      int proc_dim,
      int num_processors,
      pcoord_t **machine_coords,
      int task_dim,
      part_t num_tasks,
      tcoord_t **task_coords,
      ArrayRCP<part_t> task_comm_xadj,
      ArrayRCP<part_t> task_comm_adj,
      pcoord_t *task_communication_edge_weight_,
      int recursion_depth,
      Kokkos::View<part_t *, Kokkos::HostSpace> part_no_array,
      const part_t *machine_dimensions,
      int num_ranks_per_node = 1,
      bool divide_to_prime_first = false,
      bool reduce_best_mapping = true) :
      PartitionMapping<Adapter>(
        Teuchos::rcpFromRef<const Teuchos::Comm<int> >(*problemComm),
        Teuchos::rcpFromRef<const Environment>(*env_const_)),
      // ... remaining member initialization ...
  {
    // if machine dimensions are given, create virtual machine
    // coordinates by moving the largest gap to the wrap-around
    // boundary
    pcoord_t **virtual_machine_coordinates = machine_coords;
    bool *wrap_arounds = new bool[proc_dim];
    for (int i = 0; i < proc_dim; ++i) wrap_arounds[i] = true;

    if (machine_dimensions) {
      virtual_machine_coordinates =
        this->shiftMachineCoordinates(
          proc_dim,
          machine_dimensions,
          wrap_arounds,
          num_processors,
          machine_coords);
    }
    // ...
    this->nprocs = num_processors;

    int coordDim = task_dim;
    this->ntasks = num_tasks;

    // alloc memory for the part centers
    tcoord_t **partCenters = task_coords;

    // create the communication model
    this->proc_task_comm = new Zoltan2::CoordinateCommunicationModel<
      pcoord_t, tcoord_t, part_t, node_t>(
        proc_dim,
        virtual_machine_coordinates,
        coordDim,
        partCenters,
        this->nprocs,
        this->ntasks
        /* ... */);
    // ...
    this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;
    // ...
    int myRank = problemComm->getRank();
    // ...
    this->doMapping(myRank, this->comm);
    // ...
    // calculate the communication cost of the produced mapping
    this->proc_task_comm->calculateCommunicationCost(
      this->task_to_proc.getRawPtr(),
      task_comm_xadj.getRawPtr(),
      task_comm_adj.getRawPtr(),
      task_communication_edge_weight_);
    // ...
    delete [] wrap_arounds;

    if (machine_dimensions) {
      for (int i = 0; i < proc_dim; ++i) {
        delete [] virtual_machine_coordinates[i];
      }
      delete [] virtual_machine_coordinates;
    }
    // ...
    if (problemComm->getRank() == 0) {
      // ... (optionally writes the mapping for visualization)
    }
  }
  /*! \brief Using the machine dimensions provided, create virtual
   *  machine coordinates by assigning the largest gap along each
   *  wrap-around dimension to the boundary.
   */
  pcoord_t **shiftMachineCoordinates(
      int machine_dim,
      const part_t *machine_dimensions,
      bool *machine_extent_wrap_around,
      part_t numProcs,
      pcoord_t **mCoords) {

    pcoord_t **result_machine_coords = NULL;
    result_machine_coords = new pcoord_t*[machine_dim];

    for (int i = 0; i < machine_dim; ++i) {
      result_machine_coords[i] = new pcoord_t[numProcs];
    }

    for (int i = 0; i < machine_dim; ++i) {
      part_t numMachinesAlongDim = machine_dimensions[i];

      // count the processors at each coordinate along this dimension
      part_t *machineCounts = new part_t[numMachinesAlongDim];
      memset(machineCounts, 0, sizeof(part_t) * numMachinesAlongDim);

      int *filledCoordinates = new int[numMachinesAlongDim];

      pcoord_t *coords = mCoords[i];

      for (part_t j = 0; j < numProcs; ++j) {
        part_t mc = (part_t)coords[j];
        ++machineCounts[mc];
      }

      part_t filledCoordinateCount = 0;
      for (part_t j = 0; j < numMachinesAlongDim; ++j) {
        if (machineCounts[j] > 0) {
          filledCoordinates[filledCoordinateCount++] = j;
        }
      }

      part_t firstProcCoord = filledCoordinates[0];
      part_t firstProcCount = machineCounts[firstProcCoord];

      part_t lastProcCoord = filledCoordinates[filledCoordinateCount - 1];
      part_t lastProcCount = machineCounts[lastProcCoord];

      // the gap between the last and first used coordinates,
      // going through the wrap-around boundary
      part_t firstLastGap =
        numMachinesAlongDim - lastProcCoord + firstProcCoord;
      part_t firstLastGapProc = lastProcCount + firstProcCount;

      part_t leftSideProcCoord = firstProcCoord;
      part_t leftSideProcCount = firstProcCount;
      part_t biggestGap = 0;
      part_t biggestGapProc = numProcs;

      // find the biggest interior gap between used coordinates
      part_t shiftBorderCoordinate = -1;
      for (part_t j = 1; j < filledCoordinateCount; ++j) {
        part_t rightSideProcCoord = filledCoordinates[j];
        part_t rightSideProcCount = machineCounts[rightSideProcCoord];

        part_t gap = rightSideProcCoord - leftSideProcCoord;
        part_t gapProc = rightSideProcCount + leftSideProcCount;

        // prefer the bigger gap; on a tie, the one with fewer
        // processors on its borders
        if (gap > biggestGap ||
            (gap == biggestGap && biggestGapProc > gapProc)) {
          biggestGap = gap;
          shiftBorderCoordinate = rightSideProcCoord;
          biggestGapProc = gapProc;
        }
        leftSideProcCoord = rightSideProcCoord;
        leftSideProcCount = rightSideProcCount;
      }

      // do not shift if the wrap-around gap is at least as good
      if (!(biggestGap > firstLastGap ||
            (biggestGap == firstLastGap &&
             biggestGapProc < firstLastGapProc))) {
        shiftBorderCoordinate = -1;
      }

      for (part_t j = 0; j < numProcs; ++j) {

        if (machine_extent_wrap_around[i] &&
            coords[j] < shiftBorderCoordinate) {
          result_machine_coords[i][j] = coords[j] + numMachinesAlongDim;
        }
        else {
          result_machine_coords[i][j] = coords[j];
        }
      }
      delete [] machineCounts;
      delete [] filledCoordinates;
    }

    return result_machine_coords;
  }
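  /*! Worked example (illustrative): on a wrap-around dimension of
   *  extent 8 with processors at coordinates {0, 1, 6, 7}, the
   *  interior gap between 1 and 6 (size 5) beats the wrap-around gap
   *  between 7 and 0 (size 1), so the border shifts to coordinate 6
   *  and coordinates 0 and 1 become 8 and 9. The shifted coordinates
   *  {8, 9, 6, 7} are contiguous, so coordinate distances reflect
   *  the true hop counts.
   */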
  /*! \brief getPartsForProc: returns the assigned tasks along with
   *  the number of tasks.
   */
  virtual void getPartsForProc(int procId, part_t &numParts,
                               part_t *&parts) const {
    // ...
    numParts = taskend - task_begin;
    // ...
  }

  ArrayView<part_t> getAssignedTasksForProc(part_t procId) {
    part_t task_begin = this->proc_to_task_xadj[procId];
    part_t taskend = this->proc_to_task_xadj[procId + 1];

    if (taskend - task_begin > 0) {
      ArrayView<part_t> assignedParts(
        this->proc_to_task_adj.getRawPtr() + task_begin,
        taskend - task_begin);
      return assignedParts;
    }
    else {
      ArrayView<part_t> assignedParts;
      return assignedParts;
    }
  }
};
/*! \brief The interface function that calls CoordinateTaskMapper,
 *  which will also perform the mapping operation. The result is
 *  written into the preallocated proc_to_task_xadj (num_processors
 *  + 1 entries) and proc_to_task_adj (num_tasks entries) arrays.
 */
template <typename part_t, typename pcoord_t, typename tcoord_t>
void coordinateTaskMapperInterface(
    RCP<const Teuchos::Comm<int> > problemComm,
    int proc_dim,
    int num_processors,
    pcoord_t **machine_coords,
    int task_dim,
    part_t num_tasks,
    tcoord_t **task_coords,
    part_t *task_comm_xadj,
    part_t *task_comm_adj,
    pcoord_t *task_communication_edge_weight_,
    part_t *proc_to_task_xadj,
    part_t *proc_to_task_adj,
    int recursion_depth,
    Kokkos::View<part_t *, Kokkos::HostSpace> part_no_array,
    const part_t *machine_dimensions,
    int num_ranks_per_node = 1,
    bool divide_to_prime_first = false) {
  // ...
  typedef Tpetra::MultiVector<tcoord_t, part_t, part_t> tMVector_t;

  // wrap the raw CRS arrays without taking ownership
  Teuchos::ArrayRCP<part_t> task_communication_xadj(
    task_comm_xadj, 0, num_tasks + 1, false);

  Teuchos::ArrayRCP<part_t> task_communication_adj;
  if (task_comm_xadj) {
    Teuchos::ArrayRCP<part_t> tmp_task_communication_adj(
      task_comm_adj, 0, task_comm_xadj[num_tasks], false);
    task_communication_adj = tmp_task_communication_adj;
  }

  // ... (an Environment, envConst_, is created in elided code)
  CoordinateTaskMapper<XpetraMultiVectorAdapter<tMVector_t>, part_t> *ctm =
    new CoordinateTaskMapper<XpetraMultiVectorAdapter<tMVector_t>, part_t>(
      envConst_,
      problemComm.getRawPtr(),
      proc_dim,
      num_processors,
      machine_coords,
      task_dim,
      num_tasks,
      task_coords,
      task_communication_xadj,
      task_communication_adj,
      task_communication_edge_weight_,
      recursion_depth,
      part_no_array,
      machine_dimensions,
      num_ranks_per_node,
      divide_to_prime_first);

  // copy the mapping out of the mapper
  part_t* proc_to_task_xadj_;
  part_t* proc_to_task_adj_;

  ctm->getProcTask(proc_to_task_xadj_, proc_to_task_adj_);

  for (part_t i = 0; i <= num_processors; ++i) {
    proc_to_task_xadj[i] = proc_to_task_xadj_[i];
  }

  for (part_t i = 0; i < num_tasks; ++i) {
    proc_to_task_adj[i] = proc_to_task_adj_[i];
  }
  // ...
  delete ctm;
}
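/*! Usage sketch (illustrative; all variables hypothetical):
 *  \code
 *  std::vector<part_t> xadj(num_processors + 1), adj(num_tasks);
 *  Zoltan2::coordinateTaskMapperInterface<part_t, pcoord_t, tcoord_t>(
 *      problemComm, proc_dim, num_processors, machine_coords,
 *      task_dim, num_tasks, task_coords,
 *      task_comm_xadj, task_comm_adj,
 *      NULL,               // no edge weights
 *      &xadj[0], &adj[0],  // outputs
 *      recursion_depth, part_no_array,
 *      NULL);              // no machine dimensions
 *  // tasks of rank r: adj[xadj[r]] .. adj[xadj[r+1] - 1]
 *  \endcode
 */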
//! Writes the task communication arrows over the machine coordinates
//! to gnuplot files (3-dimensional machine coordinates only).
template <typename proc_coord_t, typename v_lno_t>
void visualize_mapping(
    int myRank,
    const int machine_coord_dim,
    const int num_ranks,
    proc_coord_t **machine_coords,
    const v_lno_t num_tasks,
    const v_lno_t *task_communication_xadj,
    const v_lno_t *task_communication_adj,
    const int *task_to_rank) {

  std::string rankStr = Teuchos::toString<int>(myRank);
  std::string gnuPlots = "gnuPlot", extentionS = ".plot";
  std::string outF = gnuPlots + rankStr + extentionS;
  std::ofstream gnuPlotCode(outF.c_str(), std::ofstream::out);

  if (machine_coord_dim != 3) {
    std::cerr << "Mapping Write is only good for 3 dim" << std::endl;
    return;
  }
  std::string ss = "";
  std::string procs = "";

  std::set<std::tuple<int, int, int, int, int, int> > my_arrows;

  for (v_lno_t origin_task = 0; origin_task < num_tasks; ++origin_task) {
    int origin_rank = task_to_rank[origin_task];
    std::string gnuPlotArrow = "set arrow from ";

    for (int j = 0; j < machine_coord_dim; ++j) {
      if (j == machine_coord_dim - 1) {
        gnuPlotArrow +=
          Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank]);
        procs +=
          Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank]);
      }
      else {
        gnuPlotArrow +=
          Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank])
          + ",";
        procs +=
          Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank])
          + " ";
      }
    }
    procs += "\n";

    gnuPlotArrow += " to ";

    for (int nind = task_communication_xadj[origin_task];
         nind < task_communication_xadj[origin_task + 1]; ++nind) {

      int neighbor_task = task_communication_adj[nind];

      bool differentnode = false;
      int neighbor_rank = task_to_rank[neighbor_task];

      // only draw an arrow if the endpoints sit on different nodes
      for (int j = 0; j < machine_coord_dim; ++j) {
        if (int(machine_coords[j][origin_rank]) !=
            int(machine_coords[j][neighbor_rank])) {
          differentnode = true;
          break;
        }
      }

      std::tuple<int, int, int, int, int, int> foo(
        (int)(machine_coords[0][origin_rank]),
        (int)(machine_coords[1][origin_rank]),
        (int)(machine_coords[2][origin_rank]),
        (int)(machine_coords[0][neighbor_rank]),
        (int)(machine_coords[1][neighbor_rank]),
        (int)(machine_coords[2][neighbor_rank]));

      // skip duplicate arrows between the same pair of nodes
      if (differentnode && my_arrows.find(foo) == my_arrows.end()) {
        my_arrows.insert(foo);

        std::string gnuPlotArrow2 = "";
        for (int j = 0; j < machine_coord_dim; ++j) {
          if (j == machine_coord_dim - 1) {
            gnuPlotArrow2 +=
              Teuchos::toString<float>(machine_coords[j][neighbor_rank]);
          }
          else {
            gnuPlotArrow2 +=
              Teuchos::toString<float>(machine_coords[j][neighbor_rank])
              + ",";
          }
        }
        ss += gnuPlotArrow + gnuPlotArrow2 + " nohead\n";
      }
    }
  }

  std::ofstream procFile("procPlot.plot", std::ofstream::out);
  procFile << procs << "\n";
  procFile.close();

  if (machine_coord_dim == 2) {
    gnuPlotCode << "plot \"procPlot.plot\" with points pointsize 3\n";
  }
  else {
    gnuPlotCode << "splot \"procPlot.plot\" with points pointsize 3\n";
  }

  gnuPlotCode << ss << "\nreplot\n pause -1\npause -1";
  gnuPlotCode.close();
}